Upload 6 files
Browse files- packages/ltx-core/src/ltx_core/components/__init__.py +10 -0
- packages/ltx-core/src/ltx_core/components/guiders.py +198 -0
- packages/ltx-core/src/ltx_core/components/noisers.py +35 -0
- packages/ltx-core/src/ltx_core/components/patchifiers.py +348 -0
- packages/ltx-core/src/ltx_core/components/protocols.py +101 -0
- packages/ltx-core/src/ltx_core/components/schedulers.py +129 -0
packages/ltx-core/src/ltx_core/components/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Diffusion pipeline components.
|
| 3 |
+
Submodules:
|
| 4 |
+
diffusion_steps - Diffusion stepping algorithms (EulerDiffusionStep)
|
| 5 |
+
guiders - Guidance strategies (CFGGuider, STGGuider, APG variants)
|
| 6 |
+
noisers - Noise samplers (GaussianNoiser)
|
| 7 |
+
patchifiers - Latent patchification (VideoLatentPatchifier, AudioPatchifier)
|
| 8 |
+
protocols - Protocol definitions (Patchifier, etc.)
|
| 9 |
+
schedulers - Sigma schedulers (LTX2Scheduler, LinearQuadraticScheduler)
|
| 10 |
+
"""
|
packages/ltx-core/src/ltx_core/components/guiders.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
from ltx_core.components.protocols import GuiderProtocol
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@dataclass(frozen=True)
class CFGGuider(GuiderProtocol):
    """
    Classifier-free guidance (CFG) guider.

    The guidance delta is (scale - 1) * (cond - uncond), which steers the
    denoising trajectory toward the conditioned prediction.

    Attributes:
        scale: Guidance strength. A value of 1.0 means no guidance; larger
            values increase adherence to the conditioning.
    """

    scale: float

    def delta(self, cond: torch.Tensor, uncond: torch.Tensor) -> torch.Tensor:
        # Standard CFG formulation, expressed via an explicit weight factor.
        weight = self.scale - 1
        return weight * (cond - uncond)

    def enabled(self) -> bool:
        # A scale of exactly 1.0 contributes a zero delta, so guidance is off.
        return self.scale != 1.0
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass(frozen=True)
class CFGStarRescalingGuider(GuiderProtocol):
    """
    CFG guider with rescaling of the unconditioned sample.

    Computes the CFG delta between conditioned and unconditioned samples, but
    first rescales the unconditioned sample by its projection coefficient onto
    the conditioned sample. This minimizes offset along the denoising direction
    and moves mostly along the conditioning axis within the distribution.

    Attributes:
        scale (float):
            Global guidance strength. 1.0 corresponds to no extra guidance
            beyond the base model prediction; values > 1.0 increase the
            influence of the conditioned sample relative to the unconditioned
            one.
    """

    scale: float

    def delta(self, cond: torch.Tensor, uncond: torch.Tensor) -> torch.Tensor:
        # Rescale the negative branch so its norm matches the conditioned
        # sample along the projection direction before taking the difference.
        coef = projection_coef(cond, uncond)
        return (self.scale - 1) * (cond - coef * uncond)

    def enabled(self) -> bool:
        return self.scale != 1.0
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@dataclass(frozen=True)
class STGGuider(GuiderProtocol):
    """
    STG guider.

    Computes the STG delta between the conditioned denoised sample and a
    perturbed denoised sample. Perturbed samples come from running the
    denoiser with perturbations applied, e.g. attention layers acting as
    passthrough for certain layers and modalities.

    Attributes:
        scale (float):
            Global strength of the STG guidance. 0.0 disables the guidance;
            larger values increase the correction applied in the direction of
            (pos_denoised - perturbed_denoised).
    """

    scale: float

    def delta(self, pos_denoised: torch.Tensor, perturbed_denoised: torch.Tensor) -> torch.Tensor:
        correction = pos_denoised - perturbed_denoised
        return correction * self.scale

    def enabled(self) -> bool:
        # Unlike CFG-style guiders, STG is additive: zero scale means disabled.
        return self.scale != 0.0
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
@dataclass(frozen=True)
class LtxAPGGuider(GuiderProtocol):
    """
    APG (adaptive projected guidance) guider.

    The (cond - uncond) delta is decomposed into components parallel and
    orthogonal to the conditioned sample. The parallel component is weighted
    by `eta`, while `scale` is applied to the combined result. This minimizes
    offset in the denoising direction and moves mostly along the conditioning
    axis within the distribution. A norm threshold can optionally cap the
    magnitude of the guidance before decomposition.

    Attributes:
        scale (float):
            Strength applied to the guidance. Controls how aggressively the
            trajectory moves in directions consistent with the conditioning
            manifold. 1.0 means no extra guidance.
        eta (float):
            Weight of the component parallel to the conditioned sample.
            1.0 keeps the full parallel component; values in [0, 1] attenuate
            it, and values > 1.0 amplify motion along the conditioning
            direction.
        norm_threshold (float):
            When > 0, the guidance delta is rescaled so its per-sample L2 norm
            (over the last three dimensions) does not exceed this threshold.
            Useful for avoiding noisy or unstable updates.
    """

    scale: float
    eta: float = 1.0
    norm_threshold: float = 0.0

    def delta(self, cond: torch.Tensor, uncond: torch.Tensor) -> torch.Tensor:
        diff = cond - uncond

        if self.norm_threshold > 0:
            # Cap the per-sample guidance norm at `norm_threshold`.
            # Norm is taken over the last three dims — assumes at least 4-D
            # tensors (e.g. batch + spatial/temporal axes); TODO confirm.
            diff_norm = diff.norm(p=2, dim=[-1, -2, -3], keepdim=True)
            cap = torch.minimum(torch.ones_like(diff), self.norm_threshold / diff_norm)
            diff = diff * cap

        # Split the guidance into components parallel / orthogonal to `cond`.
        parallel = projection_coef(diff, cond) * cond
        orthogonal = diff - parallel
        combined = parallel * self.eta + orthogonal

        return combined * (self.scale - 1)

    def enabled(self) -> bool:
        return self.scale != 1.0
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
@dataclass(frozen=False)
class LegacyStatefulAPGGuider(GuiderProtocol):
    """
    Calculates the APG (adaptive projected guidance) delta between conditioned
    and unconditioned samples.
    To minimize offset in the denoising direction and move mostly along the
    conditioning axis within the distribution, the (cond - uncond) delta is
    decomposed into components parallel and orthogonal to the conditioned
    sample. The `eta` parameter weights the parallel component, while `scale`
    is applied to the orthogonal component. Optionally, a norm threshold can
    be used to suppress guidance when the magnitude of the correction is small.
    Attributes:
        scale (float):
            Strength applied to the component of the guidance that is orthogonal
            to the conditioned sample. Controls how aggressively we move in
            directions that change semantics but stay consistent with the
            conditioning manifold.
        eta (float):
            Weight of the component of the guidance that is parallel to the
            conditioned sample. A value of 1.0 keeps the full parallel
            component; values in [0, 1] attenuate it, and values > 1.0 amplify
            motion along the conditioning direction.
        norm_threshold (float):
            Minimum L2 norm of the guidance delta below which the guidance
            can be reduced or ignored (depending on implementation).
            This is useful for avoiding noisy or unstable updates when the
            guidance signal is very small.
        momentum (float):
            Exponential moving-average coefficient for accumulating guidance
            over time. running_avg = momentum * running_avg + guidance
    """

    scale: float
    eta: float
    norm_threshold: float = 5.0
    momentum: float = 0.0
    # it is user's responsibility not to use same APGGuider for several denoisings or different modalities
    # in order not to share accumulated average across different denoisings or modalities
    # NOTE: mutable state — this is why the dataclass is frozen=False, unlike
    # the other (stateless) guiders in this module.
    running_avg: torch.Tensor | None = None

    def delta(self, cond: torch.Tensor, uncond: torch.Tensor) -> torch.Tensor:
        guidance = cond - uncond
        if self.momentum != 0:
            # Accumulate guidance across calls; the first call seeds the
            # running average with a clone so later in-place-free updates
            # never alias the caller's tensors.
            if self.running_avg is None:
                self.running_avg = guidance.clone()
            else:
                self.running_avg = self.momentum * self.running_avg + guidance
            guidance = self.running_avg

        if self.norm_threshold > 0:
            # Cap the per-sample L2 norm (over the last three dims) of the
            # guidance at `norm_threshold`.
            ones = torch.ones_like(guidance)
            guidance_norm = guidance.norm(p=2, dim=[-1, -2, -3], keepdim=True)
            scale_factor = torch.minimum(ones, self.norm_threshold / guidance_norm)
            guidance = guidance * scale_factor

        # Decompose into components parallel/orthogonal to the conditioned
        # sample; `eta` attenuates or amplifies the parallel part.
        proj_coeff = projection_coef(guidance, cond)
        g_parallel = proj_coeff * cond
        g_orth = guidance - g_parallel
        g_apg = g_parallel * self.eta + g_orth

        # Legacy convention: multiply by `scale` directly (not `scale - 1`
        # as in LtxAPGGuider), matching the `enabled()` check below.
        return g_apg * self.scale

    def enabled(self) -> bool:
        return self.scale != 0.0
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def projection_coef(to_project: torch.Tensor, project_onto: torch.Tensor) -> torch.Tensor:
    """
    Per-sample scalar coefficient of the projection of `to_project` onto
    `project_onto`.

    Both tensors are flattened per batch element; the coefficient is
    <a, b> / (||b||^2 + 1e-8), returned with shape (batch, 1).
    """
    # Flatten everything after the batch dimension so the projection is
    # computed over each whole sample.
    flat_src = to_project.flatten(start_dim=1)
    flat_dst = project_onto.flatten(start_dim=1)
    dot = (flat_src * flat_dst).sum(dim=1, keepdim=True)
    # Small epsilon guards against division by zero for all-zero targets.
    denom = (flat_dst * flat_dst).sum(dim=1, keepdim=True) + 1e-8
    return dot / denom
|
packages/ltx-core/src/ltx_core/components/noisers.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import replace
|
| 2 |
+
from typing import Protocol
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
|
| 6 |
+
from ltx_core.types import LatentState
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Noiser(Protocol):
    """Protocol for adding noise to a latent state during diffusion."""

    # Implementations receive the current latent state and a scalar noise
    # scale and return a new LatentState with noise mixed in.
    def __call__(self, latent_state: LatentState, noise_scale: float) -> LatentState: ...
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class GaussianNoiser(Noiser):
    """Adds Gaussian noise to a latent state, scaled by the denoise mask."""

    def __init__(self, generator: torch.Generator):
        super().__init__()

        # Dedicated RNG so noise sampling is reproducible and isolated from
        # the global torch seed.
        self.generator = generator

    def __call__(self, latent_state: LatentState, noise_scale: float = 1.0) -> LatentState:
        source = latent_state.latent
        # Sample noise matching the latent's shape/device/dtype.
        noise = torch.randn(
            *source.shape,
            device=source.device,
            dtype=source.dtype,
            generator=self.generator,
        )
        # Per-element blend weight: denoise_mask selects where noise applies,
        # noise_scale controls how strongly.
        mask = latent_state.denoise_mask * noise_scale
        blended = mask * noise + (1 - mask) * source
        # Cast back in case the mask arithmetic promoted the dtype.
        return replace(latent_state, latent=blended.to(source.dtype))
|
packages/ltx-core/src/ltx_core/components/patchifiers.py
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
from typing import Optional, Tuple
|
| 3 |
+
|
| 4 |
+
import einops
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
from ltx_core.components.protocols import Patchifier
|
| 8 |
+
from ltx_core.types import AudioLatentShape, SpatioTemporalScaleFactors, VideoLatentShape
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class VideoLatentPatchifier(Patchifier):
    """Converts video latents to/from flattened spatial patch tokens."""

    def __init__(self, patch_size: int):
        # (temporal, height, width) patch sizes; temporal patching is never
        # applied (always 1).
        self._patch_size = (1, patch_size, patch_size)

    @property
    def patch_size(self) -> Tuple[int, int, int]:
        return self._patch_size

    def get_token_count(self, tgt_shape: VideoLatentShape) -> int:
        # Tokens = spatio-temporal elements divided by elements per patch.
        # to_torch_shape()[2:] drops the batch and channel dims — assumes the
        # layout is (b, c, f, h, w); TODO confirm against VideoLatentShape.
        spatial_elements = math.prod(tgt_shape.to_torch_shape()[2:])
        return spatial_elements // math.prod(self._patch_size)

    def patchify(
        self,
        latents: torch.Tensor,
    ) -> torch.Tensor:
        """Fold each (p1, p2, p3) patch into the channel axis and flatten the grid."""
        pt, ph, pw = self._patch_size
        return einops.rearrange(
            latents,
            "b c (f p1) (h p2) (w p3) -> b (f h w) (c p1 p2 p3)",
            p1=pt,
            p2=ph,
            p3=pw,
        )

    def unpatchify(
        self,
        latents: torch.Tensor,
        output_shape: VideoLatentShape,
    ) -> torch.Tensor:
        """Inverse of `patchify`: restore the dense (b, c, f, h, w) layout."""
        assert self._patch_size[0] == 1, "Temporal patch size must be 1 for symmetric patchifier"

        # Grid extent (in patches) along each axis of the target shape.
        grid_f = output_shape.frames // self._patch_size[0]
        grid_h = output_shape.height // self._patch_size[1]
        grid_w = output_shape.width // self._patch_size[2]

        return einops.rearrange(
            latents,
            "b (f h w) (c p q) -> b c f (h p) (w q)",
            f=grid_f,
            h=grid_h,
            w=grid_w,
            p=self._patch_size[1],
            q=self._patch_size[2],
        )

    def get_patch_grid_bounds(
        self,
        output_shape: AudioLatentShape | VideoLatentShape,
        device: Optional[torch.device] = None,
    ) -> torch.Tensor:
        """
        Return per-dimension bounds [inclusive start, exclusive end) for every
        patch produced by `patchify`, in original video grid coordinates
        (frame/time, height, width).

        The result is shaped `[batch_size, 3, num_patches, 2]`:
        - axis 1 (size 3) enumerates (frame/time, height, width)
        - axis 3 (size 2) stores `[start, end)` indices per dimension

        Args:
            output_shape: Video grid description containing frames, height, and width.
            device: Device of the latent tensor.
        """
        if not isinstance(output_shape, VideoLatentShape):
            raise ValueError("VideoLatentPatchifier expects VideoLatentShape when computing coordinates")

        frames = output_shape.frames
        height = output_shape.height
        width = output_shape.width
        batch_size = output_shape.batch

        # Guard against degenerate grids.
        assert frames > 0, f"frames must be positive, got {frames}"
        assert height > 0, f"height must be positive, got {height}"
        assert width > 0, f"width must be positive, got {width}"
        assert batch_size > 0, f"batch_size must be positive, got {batch_size}"

        # Start coordinate of every patch along each axis; indexing='ij' keeps
        # the (frame, height, width) ordering. Stacking gives shape
        # (3, grid_f, grid_h, grid_w).
        starts = torch.stack(
            torch.meshgrid(
                torch.arange(start=0, end=frames, step=self._patch_size[0], device=device),
                torch.arange(start=0, end=height, step=self._patch_size[1], device=device),
                torch.arange(start=0, end=width, step=self._patch_size[2], device=device),
                indexing="ij",
            ),
            dim=0,
        )

        # Per-axis patch extent, shaped (3, 1, 1, 1) for broadcasting.
        extent = torch.tensor(
            self._patch_size,
            device=starts.device,
            dtype=starts.dtype,
        ).view(3, 1, 1, 1)

        # Pair each start with its exclusive end: shape (3, gf, gh, gw, 2).
        bounds = torch.stack((starts, starts + extent), dim=-1)

        # Broadcast over the batch and flatten the grid into one patch axis:
        # final shape (batch_size, 3, num_patches, 2).
        return einops.repeat(
            bounds,
            "c f h w bounds -> b c (f h w) bounds",
            b=batch_size,
            bounds=2,
        )
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def get_pixel_coords(
    latent_coords: torch.Tensor,
    scale_factors: SpatioTemporalScaleFactors,
    causal_fix: bool = False,
) -> torch.Tensor:
    """
    Map latent-space `[start, end)` coordinates to pixel space by multiplying
    each axis (frame/time, height, width) by the corresponding VAE
    downsampling factor.

    Args:
        latent_coords: Latent bounds shaped `(batch, 3, num_patches, 2)`.
        scale_factors: `(temporal, height, width)` integer scale factors.
        causal_fix: When True, adjust the temporal axis of the first frame so
            causal VAEs (which keep frame zero at unit temporal stride) still
            yield non-negative timestamps.
    """
    # View the scale factors as (1, 3, 1, 1) so they broadcast across the
    # (batch, axis, patch, bound) layout, scaling only per axis.
    view_shape = [1] * latent_coords.ndim
    view_shape[1] = -1
    factors = torch.tensor(scale_factors, device=latent_coords.device).view(*view_shape)

    pixel_coords = latent_coords * factors

    if causal_fix:
        # The VAE's temporal stride for the very first frame is 1 instead of
        # scale_factors[0]; shift and clamp to keep timestamps causal and
        # non-negative.
        pixel_coords[:, 0, ...] = (pixel_coords[:, 0, ...] + 1 - scale_factors[0]).clamp(min=0)

    return pixel_coords
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
class AudioPatchifier(Patchifier):
    """Patchifier tailored for spectrogram/audio latents."""

    def __init__(
        self,
        patch_size: int,
        sample_rate: int = 16000,
        hop_length: int = 160,
        audio_latent_downsample_factor: int = 4,
        is_causal: bool = True,
        shift: int = 0,
    ):
        """
        Args:
            patch_size: Number of mel bins combined into a single patch;
                controls resolution along the frequency axis.
            sample_rate: Original waveform sampling rate, used to map latent
                indices back to seconds for audio/video alignment.
            hop_length: Spectrogram window hop length; determines how many
                real-time samples separate consecutive latent frames.
            audio_latent_downsample_factor: Ratio between spectrogram frames
                and latent frames, compensating for downsampling inside the
                VAE encoder.
            is_causal: When True, timestamps are shifted to respect causal
                receptive fields (no peeking into the future).
            shift: Integer offset applied to latent indices; enables building
                overlapping windows from the same latent sequence.
        """
        self.hop_length = hop_length
        self.sample_rate = sample_rate
        self.audio_latent_downsample_factor = audio_latent_downsample_factor
        self.is_causal = is_causal
        self.shift = shift
        self._patch_size = (1, patch_size, patch_size)

    @property
    def patch_size(self) -> Tuple[int, int, int]:
        return self._patch_size

    def get_token_count(self, tgt_shape: AudioLatentShape) -> int:
        # One token per latent time step; frequency bins are folded into the
        # channel axis by `patchify`.
        return tgt_shape.frames

    def _get_audio_latent_time_in_sec(
        self,
        start_latent: int,
        end_latent: int,
        dtype: torch.dtype,
        device: Optional[torch.device] = None,
    ) -> torch.Tensor:
        """
        Convert latent indices [start_latent, end_latent) to timestamps in
        seconds, honoring the causal offset and configured hop length.

        Args:
            start_latent: Inclusive start index (first timestamp returned).
            end_latent: Exclusive end index (controls how many timestamps).
            dtype: Floating-point dtype of the returned tensor.
            device: Target device; defaults to CPU to avoid surprising GPU
                allocations.
        """
        if device is None:
            device = torch.device("cpu")

        latent_idx = torch.arange(start_latent, end_latent, dtype=dtype, device=device)

        # Latent frame -> mel-spectrogram frame.
        mel_idx = latent_idx * self.audio_latent_downsample_factor

        if self.is_causal:
            # The "+1" anchors each timestamp to the first fully available
            # sample; clip keeps early frames non-negative.
            causal_offset = 1
            mel_idx = (mel_idx + causal_offset - self.audio_latent_downsample_factor).clip(min=0)

        # Mel frame -> seconds.
        return mel_idx * self.hop_length / self.sample_rate

    def _compute_audio_timings(
        self,
        batch_size: int,
        num_steps: int,
        device: Optional[torch.device] = None,
    ) -> torch.Tensor:
        """
        Build a `(B, 1, T, 2)` tensor of [start, end) timestamps for each
        latent frame; backs `get_patch_grid_bounds`.

        Args:
            batch_size: Number of sequences to broadcast the timings over.
            num_steps: Number of latent frames to convert into timestamps.
            device: Device for the resulting tensor (CPU when omitted).
        """
        target_device = torch.device("cpu") if device is None else device

        # offset 0 -> start timestamps, offset 1 -> end timestamps.
        per_bound = []
        for offset in (0, 1):
            times = self._get_audio_latent_time_in_sec(
                self.shift + offset,
                num_steps + self.shift + offset,
                torch.float32,
                target_device,
            )
            # (T,) -> (B, 1, T)
            per_bound.append(times.unsqueeze(0).expand(batch_size, -1).unsqueeze(1))

        return torch.stack(per_bound, dim=-1)

    def patchify(
        self,
        audio_latents: torch.Tensor,
    ) -> torch.Tensor:
        """
        Flatten the audio latent tensor along time, folding channels and
        frequency bins into one token axis. Use `get_patch_grid_bounds` to
        derive per-frame timestamps.

        Args:
            audio_latents: Latent tensor to patchify.

        Returns:
            Flattened patch tokens tensor.
        """
        return einops.rearrange(
            audio_latents,
            "b c t f -> b t (c f)",
        )

    def unpatchify(
        self,
        audio_latents: torch.Tensor,
        output_shape: AudioLatentShape,
    ) -> torch.Tensor:
        """
        Restore the `(B, C, T, F)` spectrogram tensor from flattened patches.
        Use `get_patch_grid_bounds` to recompute each frame's real-time
        position.

        Args:
            audio_latents: Latent tensor to unpatchify, shaped
                (batch, time, freq * channels).
            output_shape: Shape of the unpatched output tensor.

        Returns:
            Unpatched latent tensor.
        """
        return einops.rearrange(
            audio_latents,
            "b t (c f) -> b c t f",
            c=output_shape.channels,
            f=output_shape.mel_bins,
        )

    def get_patch_grid_bounds(
        self,
        output_shape: AudioLatentShape | VideoLatentShape,
        device: Optional[torch.device] = None,
    ) -> torch.Tensor:
        """
        Return temporal bounds `[inclusive start, exclusive end)` for every
        patch emitted by `patchify`, as timestamps in seconds aligned with the
        original spectrogram grid.

        The result is shaped `[batch_size, 1, time_steps, 2]`:
        - axis 1 (size 1) is the temporal dimension
        - axis 3 (size 2) stores `[start, end)` timestamps per patch

        Args:
            output_shape: Audio grid specification with the number of time steps.
            device: Target device for the returned tensor.
        """
        if not isinstance(output_shape, AudioLatentShape):
            raise ValueError("AudioPatchifier expects AudioLatentShape when computing coordinates")

        return self._compute_audio_timings(output_shape.batch, output_shape.frames, device)
|
packages/ltx-core/src/ltx_core/components/protocols.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Protocol, Tuple
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
from ltx_core.types import AudioLatentShape, VideoLatentShape
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class Patchifier(Protocol):
    """
    Protocol for patchifiers that convert latent tensors into patches and assemble them back.
    """

    def patchify(
        self,
        latents: torch.Tensor,
    ) -> torch.Tensor:
        """
        Convert latent tensors into flattened patch tokens.

        Args:
            latents: Latent tensor to patchify.

        Returns:
            Flattened patch tokens tensor.
        """
        # NOTE: the docstring must precede `...` — placed after the ellipsis it
        # becomes a dead string statement and `__doc__` is None.
        ...

    def unpatchify(
        self,
        latents: torch.Tensor,
        output_shape: AudioLatentShape | VideoLatentShape,
    ) -> torch.Tensor:
        """
        Converts latent tensors between spatio-temporal formats and flattened sequence representations.

        Args:
            latents: Patch tokens that must be rearranged back into the latent grid constructed by `patchify`.
            output_shape: Shape of the output tensor. Note that output_shape is either AudioLatentShape or
                VideoLatentShape.

        Returns:
            Dense latent tensor restored from the flattened representation.
        """
        ...

    @property
    def patch_size(self) -> Tuple[int, int, int]:
        """
        Returns the patch size as a tuple of (temporal, height, width) dimensions.
        """
        ...

    def get_patch_grid_bounds(
        self,
        output_shape: AudioLatentShape | VideoLatentShape,
        device: torch.device | None = None,
    ) -> torch.Tensor:
        """
        Compute metadata describing where each latent patch resides within the
        grid specified by `output_shape`.

        Args:
            output_shape: Target grid layout for the patches.
            device: Target device for the returned tensor.

        Returns:
            Tensor containing patch coordinate metadata such as spatial or temporal intervals.
        """
        ...
|
| 65 |
+
class SchedulerProtocol(Protocol):
    """
    Protocol for schedulers that produce a sigma schedule tensor for a given
    number of steps. The returned tensor lives on the CPU.
    """

    def execute(self, steps: int, **kwargs) -> torch.FloatTensor:
        ...
+
|
| 74 |
+
class GuiderProtocol(Protocol):
    """
    Protocol for guiders that compute a delta tensor given conditioning inputs.

    The returned delta should be added to the conditional output (cond), enabling
    multiple guiders to be chained together by accumulating their deltas.
    """

    # Guidance strength applied by the concrete guider implementation.
    scale: float

    def delta(
        self,
        cond: torch.Tensor,
        uncond: torch.Tensor,
    ) -> torch.Tensor:
        ...

    def enabled(self) -> bool:
        """
        Returns whether the corresponding perturbation is enabled. E.g. for CFG, this should return False if the scale
        is 1.0.
        """
        ...
| 92 |
+
|
| 93 |
+
class DiffusionStepProtocol(Protocol):
    """
    Protocol for diffusion steps that provide a next sample tensor for a given current sample tensor,
    current denoised sample tensor, and sigmas tensor.
    """

    def step(
        self,
        sample: torch.Tensor,
        denoised_sample: torch.Tensor,
        sigmas: torch.Tensor,
        step_index: int,
    ) -> torch.Tensor:
        ...
|
packages/ltx-core/src/ltx_core/components/schedulers.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
from functools import lru_cache

import numpy
import scipy
import scipy.stats
import torch

from ltx_core.components.protocols import SchedulerProtocol
|
| 9 |
+
|
| 10 |
+
# Token-count anchors between which LTX2Scheduler linearly interpolates its
# sigma shift: `base_shift` applies at BASE_SHIFT_ANCHOR tokens, `max_shift`
# at MAX_SHIFT_ANCHOR tokens.
BASE_SHIFT_ANCHOR = 1024
MAX_SHIFT_ANCHOR = 4096
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class LTX2Scheduler(SchedulerProtocol):
    """
    Default scheduler for LTX-2 diffusion sampling.

    Generates a sigma schedule with token-count-dependent shifting and optional
    stretching to a terminal value.
    """

    def execute(
        self,
        steps: int,
        latent: torch.Tensor | None = None,
        max_shift: float = 2.05,
        base_shift: float = 0.95,
        stretch: bool = True,
        terminal: float = 0.1,
        **_kwargs,
    ) -> torch.FloatTensor:
        """
        Build a sigma schedule of `steps + 1` values running from 1.0 down to 0.0.

        Args:
            steps: Number of diffusion steps; the schedule has `steps + 1` entries.
            latent: Optional latent whose trailing dimensions determine the token
                count controlling the shift. When None, MAX_SHIFT_ANCHOR tokens
                are assumed.
            max_shift: Shift value at MAX_SHIFT_ANCHOR tokens.
            base_shift: Shift value at BASE_SHIFT_ANCHOR tokens.
            stretch: Whether to rescale the non-zero sigmas so the final one
                equals `terminal`.
            terminal: Target value of the last non-zero sigma when stretching.

        Returns:
            Float32 tensor of sigmas ending at 0.0.
        """
        # Token count drives the shift, interpolated linearly between the anchors.
        tokens = math.prod(latent.shape[2:]) if latent is not None else MAX_SHIFT_ANCHOR
        slope = (max_shift - base_shift) / (MAX_SHIFT_ANCHOR - BASE_SHIFT_ANCHOR)
        intercept = base_shift - slope * BASE_SHIFT_ANCHOR
        sigma_shift = tokens * slope + intercept

        sigmas = torch.linspace(1.0, 0.0, steps + 1)

        # Time-shift each sigma; exp(sigma_shift) is hoisted so it is computed
        # once instead of twice per call (the original also raised the
        # denominator term to a constant power of 1, which is a no-op).
        exp_shift = math.exp(sigma_shift)
        sigmas = torch.where(
            sigmas != 0,
            exp_shift / (exp_shift + (1 / sigmas - 1)),
            0,
        )

        # Stretch sigmas so that the final non-zero value matches `terminal`.
        if stretch:
            non_zero_mask = sigmas != 0
            one_minus_z = 1.0 - sigmas[non_zero_mask]
            scale_factor = one_minus_z[-1] / (1.0 - terminal)
            sigmas[non_zero_mask] = 1.0 - (one_minus_z / scale_factor)

        return sigmas.to(torch.float32)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class LinearQuadraticScheduler(SchedulerProtocol):
    """
    Scheduler with linear steps followed by quadratic steps.

    Produces a sigma schedule that transitions linearly up to a threshold,
    then follows a quadratic curve for the remaining steps.
    """

    def execute(
        self, steps: int, threshold_noise: float = 0.025, linear_steps: int | None = None, **_kwargs
    ) -> torch.FloatTensor:
        # A single step degenerates to the trivial full-noise -> clean schedule.
        if steps == 1:
            return torch.FloatTensor([1.0, 0.0])

        n_linear = steps // 2 if linear_steps is None else linear_steps
        n_quadratic = steps - n_linear

        # Evenly spaced noise levels up to the threshold.
        noise_levels = [i * threshold_noise / n_linear for i in range(n_linear)]

        # Quadratic tail chosen to stay continuous at the threshold and to
        # reach 1.0 at the final step.
        step_diff = n_linear - threshold_noise * steps
        if n_quadratic > 0:
            quad = step_diff / (n_linear * n_quadratic**2)
            lin = threshold_noise / n_linear - 2 * step_diff / (n_quadratic**2)
            const = quad * (n_linear**2)
            noise_levels += [quad * (i**2) + lin * i + const for i in range(n_linear, steps)]

        noise_levels.append(1.0)
        # Sigmas are the complement of the noise levels.
        return torch.FloatTensor([1.0 - level for level in noise_levels])
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class BetaScheduler(SchedulerProtocol):
    """
    Scheduler using a beta distribution to sample timesteps.

    Based on: https://arxiv.org/abs/2407.12173
    """

    # Flux-style time shift used when precomputing the sigma lookup table.
    shift = 2.37
    # Resolution of the precomputed sigma lookup table.
    timesteps_length = 10000

    def execute(self, steps: int, alpha: float = 0.6, beta: float = 0.6, **_kwargs) -> torch.FloatTensor:
        """
        Execute the beta scheduler.

        Args:
            steps: The number of steps to execute the scheduler for.
            alpha: The alpha parameter for the beta distribution.
            beta: The beta parameter for the beta distribution.

        Warnings:
            The number of steps within `sigmas` theoretically might be less than `steps+1`,
            because of the deduplication of the identical timesteps

        Returns:
            A tensor of sigmas.
        """
        # `**_kwargs` added so the signature satisfies SchedulerProtocol.execute
        # (and matches the sibling schedulers): generic callers may pass extra
        # keyword arguments such as `latent=`.
        model_sampling_sigmas = _precalculate_model_sampling_sigmas(self.shift, self.timesteps_length)
        total_timesteps = len(model_sampling_sigmas) - 1

        # Map uniform points through the beta inverse CDF to pick timesteps.
        ts = 1 - numpy.linspace(0, 1, steps, endpoint=False)
        ts = numpy.rint(scipy.stats.beta.ppf(ts, alpha, beta) * total_timesteps).tolist()
        # Deduplicate while preserving order; may shorten the schedule.
        ts = list(dict.fromkeys(ts))

        sigmas = [float(model_sampling_sigmas[int(t)]) for t in ts] + [0.0]
        return torch.FloatTensor(sigmas)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
@lru_cache(maxsize=5)
def _precalculate_model_sampling_sigmas(shift: float, timesteps_length: int) -> torch.Tensor:
    """
    Precompute the flux-time-shifted sigma table for `timesteps_length` steps.

    NOTE(review): the result is cached by lru_cache, so callers share one
    tensor object — it must not be mutated in place.
    """
    grid = torch.arange(1, timesteps_length + 1, 1) / timesteps_length
    return torch.Tensor([flux_time_shift(shift, 1.0, t) for t in grid])
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def flux_time_shift(mu: float, sigma: float, t: float) -> float:
    """Apply the flux time-shift transform: exp(mu) / (exp(mu) + (1/t - 1) ** sigma)."""
    e_mu = math.exp(mu)
    return e_mu / (e_mu + (1.0 / t - 1.0) ** sigma)
|