Fabrice-TIERCELIN committed on
Commit
2bbc972
·
verified ·
1 Parent(s): b9fe529

Upload 4 files

Browse files
packages/ltx-core/src/ltx_core/tools.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, replace
2
+ from typing import Protocol
3
+
4
+ import torch
5
+ from torch._prims_common import DeviceLikeType
6
+
7
+ from ltx_core.components.patchifiers import (
8
+ AudioLatentShape,
9
+ AudioPatchifier,
10
+ VideoLatentPatchifier,
11
+ VideoLatentShape,
12
+ get_pixel_coords,
13
+ )
14
+ from ltx_core.components.protocols import Patchifier
15
+ from ltx_core.types import LatentState, SpatioTemporalScaleFactors
16
+
17
+ DEFAULT_SCALE_FACTORS = SpatioTemporalScaleFactors.default()
18
+
19
+
20
class LatentTools(Protocol):
    """
    Protocol for helpers that build and transform latent states.
    """

    patchifier: Patchifier
    target_shape: VideoLatentShape | AudioLatentShape

    def create_initial_state(
        self,
        device: DeviceLikeType,
        dtype: torch.dtype,
        initial_latent: torch.Tensor | None = None,
    ) -> LatentState:
        """
        Create an initial latent state. If initial_latent is provided, it will be used to create the latent state.
        """
        ...

    def patchify(self, latent_state: LatentState) -> LatentState:
        """
        Convert the dense latent state into its patch-token representation.

        Raises:
            ValueError: If the latent does not match ``target_shape``.
        """
        if latent_state.latent.shape != self.target_shape.to_torch_shape():
            raise ValueError(
                f"Latent state has shape {latent_state.latent.shape}, expected shape is "
                f"{self.target_shape.to_torch_shape()}"
            )
        state = latent_state.clone()
        patched = self.patchifier.patchify(state.latent)
        patched_clean = self.patchifier.patchify(state.clean_latent)
        patched_mask = self.patchifier.patchify(state.denoise_mask)
        return replace(state, latent=patched, denoise_mask=patched_mask, clean_latent=patched_clean)

    def unpatchify(self, latent_state: LatentState) -> LatentState:
        """
        Convert the patch-token latent state back into its dense representation.
        """
        state = latent_state.clone()
        dense = self.patchifier.unpatchify(state.latent, output_shape=self.target_shape)
        dense_clean = self.patchifier.unpatchify(state.clean_latent, output_shape=self.target_shape)
        dense_mask = self.patchifier.unpatchify(
            state.denoise_mask, output_shape=self.target_shape.mask_shape()
        )
        return replace(state, latent=dense, denoise_mask=dense_mask, clean_latent=dense_clean)

    def clear_conditioning(self, latent_state: LatentState) -> LatentState:
        """
        Clear the conditioning from the latent state. This method removes extra tokens from the end of the latent.
        Therefore, conditioning items should add extra tokens ONLY to the end of the latent.
        """
        state = latent_state.clone()
        # Tokens belonging to the base target shape come first; anything beyond
        # them was appended by conditioning and is dropped here.
        num_tokens = self.patchifier.get_token_count(self.target_shape)
        return LatentState(
            latent=state.latent[:, :num_tokens],
            denoise_mask=torch.ones_like(state.denoise_mask)[:, :num_tokens],
            positions=state.positions[:, :, :num_tokens],
            clean_latent=state.clean_latent[:, :num_tokens],
        )
80
+
81
+
82
@dataclass(frozen=True)
class VideoLatentTools(LatentTools):
    """
    Tools for building video latent states.

    Attributes:
        patchifier: Converts video latents to/from patch-token sequences.
        target_shape: Expected dense latent shape (batch, channels, frames, height, width).
        fps: Frames per second of the source video; used to convert the temporal
            position channel from frame indices into seconds.
        scale_factors: Spatiotemporal downscaling between pixel and latent space.
        causal_fix: Forwarded to `get_pixel_coords`; presumably adjusts the
            temporal coordinate of the first (causal) frame — confirm in patchifiers.
    """

    patchifier: VideoLatentPatchifier
    target_shape: VideoLatentShape
    fps: float
    scale_factors: SpatioTemporalScaleFactors = DEFAULT_SCALE_FACTORS
    causal_fix: bool = True

    def create_initial_state(
        self,
        device: DeviceLikeType,
        dtype: torch.dtype,
        initial_latent: torch.Tensor | None = None,
    ) -> LatentState:
        """
        Create a patchified initial latent state for video denoising.

        Args:
            device: Device on which to allocate tensors.
            dtype: Dtype for the latent and positions tensors.
            initial_latent: Optional starting latent matching `target_shape`;
                when omitted, an all-zeros latent is allocated.

        Returns:
            A patchified LatentState with an all-ones denoise mask and per-token
            positions whose temporal channel is expressed in seconds.

        Raises:
            ValueError: If `initial_latent` does not match `target_shape`.
        """
        expected_shape = self.target_shape.to_torch_shape()
        if initial_latent is None:
            initial_latent = torch.zeros(*expected_shape, device=device, dtype=dtype)
        elif initial_latent.shape != expected_shape:
            # ValueError instead of assert: validation must survive `python -O`
            # and stay consistent with LatentTools.patchify.
            raise ValueError(
                f"Latent shape {initial_latent.shape} does not match target shape {expected_shape}"
            )

        clean_latent = initial_latent.clone()

        # Mask value 1 = full denoising for every token by default.
        denoise_mask = torch.ones(
            *self.target_shape.mask_shape().to_torch_shape(),
            device=device,
            dtype=torch.float32,
        )

        latent_coords = self.patchifier.get_patch_grid_bounds(
            output_shape=self.target_shape,
            device=device,
        )

        positions = get_pixel_coords(
            latent_coords=latent_coords,
            scale_factors=self.scale_factors,
            causal_fix=self.causal_fix,
        ).float()
        # Channel 0 of positions carries frame indices; divide by fps to get seconds.
        positions[:, 0, ...] = positions[:, 0, ...] / self.fps

        return self.patchify(
            LatentState(
                latent=initial_latent,
                denoise_mask=denoise_mask,
                positions=positions.to(dtype),
                clean_latent=clean_latent,
            )
        )
139
+
140
+
141
@dataclass(frozen=True)
class AudioLatentTools(LatentTools):
    """
    Tools for building audio latent states.

    Attributes:
        patchifier: Converts audio latents to/from patch-token sequences.
        target_shape: Expected dense latent shape (batch, channels, frames, mel_bins).
    """

    patchifier: AudioPatchifier
    target_shape: AudioLatentShape

    def create_initial_state(
        self,
        device: DeviceLikeType,
        dtype: torch.dtype,
        initial_latent: torch.Tensor | None = None,
    ) -> LatentState:
        """
        Create a patchified initial latent state for audio denoising.

        Args:
            device: Device on which to allocate tensors.
            dtype: Dtype for the latent tensor.
            initial_latent: Optional starting latent matching `target_shape`;
                when omitted, an all-zeros latent is allocated.

        Returns:
            A patchified LatentState with an all-ones denoise mask. Unlike the
            video variant, positions are the raw latent-grid coordinates (no
            pixel/fps mapping).

        Raises:
            ValueError: If `initial_latent` does not match `target_shape`.
        """
        expected_shape = self.target_shape.to_torch_shape()
        if initial_latent is None:
            initial_latent = torch.zeros(*expected_shape, device=device, dtype=dtype)
        elif initial_latent.shape != expected_shape:
            # ValueError instead of assert: validation must survive `python -O`
            # and stay consistent with LatentTools.patchify.
            raise ValueError(
                f"Latent shape {initial_latent.shape} does not match target shape {expected_shape}"
            )

        clean_latent = initial_latent.clone()

        # Mask value 1 = full denoising for every token by default.
        denoise_mask = torch.ones(
            *self.target_shape.mask_shape().to_torch_shape(),
            device=device,
            dtype=torch.float32,
        )

        latent_coords = self.patchifier.get_patch_grid_bounds(
            output_shape=self.target_shape,
            device=device,
        )

        return self.patchify(
            LatentState(
                latent=initial_latent,
                denoise_mask=denoise_mask,
                positions=latent_coords,
                clean_latent=clean_latent,
            )
        )
packages/ltx-core/src/ltx_core/types.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import NamedTuple
3
+
4
+ import torch
5
+
6
+
7
class VideoPixelShape(NamedTuple):
    """
    Shape of the tensor representing the video pixel array. Assumes BGR channel format.
    """

    batch: int
    frames: int
    height: int
    width: int
    fps: float  # frames per second; carried with the shape for time conversions


class SpatioTemporalScaleFactors(NamedTuple):
    """
    Describes the spatiotemporal downscaling between decoded video space and
    the corresponding VAE latent grid.

    NOTE: field order is (time, width, height) — width precedes height, unlike
    the (height, width) order used by the shape tuples. Prefer named access.
    """

    time: int
    width: int
    height: int

    @classmethod
    def default(cls) -> "SpatioTemporalScaleFactors":
        """Default VAE factors: 8x temporal, 32x spatial downscaling."""
        return cls(time=8, width=32, height=32)


VIDEO_SCALE_FACTORS = SpatioTemporalScaleFactors.default()


class VideoLatentShape(NamedTuple):
    """
    Shape of the tensor representing video in VAE latent space.
    The latent representation is a 5D tensor with dimensions ordered as
    (batch, channels, frames, height, width). Spatial and temporal dimensions
    are downscaled relative to pixel space according to the VAE's scale factors.
    """

    batch: int
    channels: int
    frames: int
    height: int
    width: int

    def to_torch_shape(self) -> torch.Size:
        """Return this shape as a torch.Size suitable for tensor allocation."""
        return torch.Size([self.batch, self.channels, self.frames, self.height, self.width])

    @staticmethod
    def from_torch_shape(shape: torch.Size) -> "VideoLatentShape":
        """Build a VideoLatentShape from a 5D size ordered (batch, channels, frames, height, width)."""
        return VideoLatentShape(
            batch=shape[0],
            channels=shape[1],
            frames=shape[2],
            height=shape[3],
            width=shape[4],
        )

    def mask_shape(self) -> "VideoLatentShape":
        """Shape of a mask over this latent: same grid with a single channel."""
        return self._replace(channels=1)

    @staticmethod
    def from_pixel_shape(
        shape: VideoPixelShape,
        latent_channels: int = 128,
        scale_factors: SpatioTemporalScaleFactors = VIDEO_SCALE_FACTORS,
    ) -> "VideoLatentShape":
        """
        Derive the latent shape produced by encoding a pixel-space video.

        The first pixel frame maps to its own latent frame (causal encoding),
        hence the `(frames - 1) // time + 1` formula.
        """
        # BUGFIX: use named fields. The previous positional indexing divided
        # height by scale_factors[1] (the *width* factor) and width by
        # scale_factors[2] (the *height* factor), silently swapping the two
        # whenever the spatial factors differ. `upscale` below already uses
        # named fields; this makes the pair consistent inverses.
        frames = (shape.frames - 1) // scale_factors.time + 1
        height = shape.height // scale_factors.height
        width = shape.width // scale_factors.width

        return VideoLatentShape(
            batch=shape.batch,
            channels=latent_channels,
            frames=frames,
            height=height,
            width=width,
        )

    def upscale(self, scale_factors: SpatioTemporalScaleFactors = VIDEO_SCALE_FACTORS) -> "VideoLatentShape":
        """Inverse of `from_pixel_shape`: the pixel-space shape (3 channels) this latent decodes to."""
        return self._replace(
            channels=3,
            frames=(self.frames - 1) * scale_factors.time + 1,
            height=self.height * scale_factors.height,
            width=self.width * scale_factors.width,
        )


class AudioLatentShape(NamedTuple):
    """
    Shape of audio in VAE latent space: (batch, channels, frames, mel_bins).
    mel_bins is the number of frequency bins from the mel-spectrogram encoding.
    """

    batch: int
    channels: int
    frames: int
    mel_bins: int

    def to_torch_shape(self) -> torch.Size:
        """Return this shape as a torch.Size suitable for tensor allocation."""
        return torch.Size([self.batch, self.channels, self.frames, self.mel_bins])

    def mask_shape(self) -> "AudioLatentShape":
        """Shape of a mask over this latent: single channel, single mel bin per frame."""
        return self._replace(channels=1, mel_bins=1)

    @staticmethod
    def from_torch_shape(shape: torch.Size) -> "AudioLatentShape":
        """Build an AudioLatentShape from a 4D size ordered (batch, channels, frames, mel_bins)."""
        return AudioLatentShape(
            batch=shape[0],
            channels=shape[1],
            frames=shape[2],
            mel_bins=shape[3],
        )

    @staticmethod
    def from_duration(
        batch: int,
        duration: float,
        channels: int = 8,
        mel_bins: int = 16,
        sample_rate: int = 16000,
        hop_length: int = 160,
        audio_latent_downsample_factor: int = 4,
    ) -> "AudioLatentShape":
        """
        Compute the latent shape covering `duration` seconds of audio.

        latents/second = sample_rate / hop_length / downsample_factor
        (25 latent frames per second with the defaults).
        """
        latents_per_second = float(sample_rate) / float(hop_length) / float(audio_latent_downsample_factor)

        return AudioLatentShape(
            batch=batch,
            channels=channels,
            frames=round(duration * latents_per_second),
            mel_bins=mel_bins,
        )

    @staticmethod
    def from_video_pixel_shape(
        shape: VideoPixelShape,
        channels: int = 8,
        mel_bins: int = 16,
        sample_rate: int = 16000,
        hop_length: int = 160,
        audio_latent_downsample_factor: int = 4,
    ) -> "AudioLatentShape":
        """Latent shape for an audio track spanning the same duration as the given video."""
        return AudioLatentShape.from_duration(
            batch=shape.batch,
            duration=float(shape.frames) / float(shape.fps),
            channels=channels,
            mel_bins=mel_bins,
            sample_rate=sample_rate,
            hop_length=hop_length,
            audio_latent_downsample_factor=audio_latent_downsample_factor,
        )
157
+
158
+
159
@dataclass(frozen=True)
class LatentState:
    """
    State of latents during the diffusion denoising process.

    Attributes:
        latent: The current noisy latent tensor being denoised.
        denoise_mask: Mask encoding the denoising strength for each token (1 = full denoising, 0 = no denoising).
        positions: Positional indices for each latent element, used for positional embeddings.
        clean_latent: Initial state of the latent before denoising, may include conditioning latents.
    """

    latent: torch.Tensor
    denoise_mask: torch.Tensor
    positions: torch.Tensor
    clean_latent: torch.Tensor

    def clone(self) -> "LatentState":
        """Return a new LatentState holding independent copies of every tensor."""
        copies = {
            name: tensor.clone()
            for name, tensor in (
                ("latent", self.latent),
                ("denoise_mask", self.denoise_mask),
                ("positions", self.positions),
                ("clean_latent", self.clean_latent),
            )
        }
        return LatentState(**copies)
packages/ltx-core/src/ltx_core/utils.py CHANGED
@@ -1,6 +1,3 @@
1
- # Copyright (c) 2025 Lightricks. All rights reserved.
2
- # Created by Amit Pintz.
3
-
4
  from typing import Any
5
 
6
  import torch
@@ -8,7 +5,6 @@ import torch
8
 
9
  def rms_norm(x: torch.Tensor, weight: torch.Tensor | None = None, eps: float = 1e-6) -> torch.Tensor:
10
  """Root-mean-square (RMS) normalize `x` over its last dimension.
11
-
12
  Thin wrapper around `torch.nn.functional.rms_norm` that infers the normalized
13
  shape and forwards `weight` and `eps`.
14
  """
@@ -29,7 +25,6 @@ def to_velocity(
29
  ) -> torch.Tensor:
30
  """
31
  Convert the sample and its denoised version to velocity.
32
-
33
  Returns:
34
  Velocity
35
  """
@@ -48,7 +43,6 @@ def to_denoised(
48
  ) -> torch.Tensor:
49
  """
50
  Convert the sample and its denoising velocity to denoised sample.
51
-
52
  Returns:
53
  Denoised sample
54
  """
 
 
 
 
1
  from typing import Any
2
 
3
  import torch
 
5
 
6
  def rms_norm(x: torch.Tensor, weight: torch.Tensor | None = None, eps: float = 1e-6) -> torch.Tensor:
7
  """Root-mean-square (RMS) normalize `x` over its last dimension.
 
8
  Thin wrapper around `torch.nn.functional.rms_norm` that infers the normalized
9
  shape and forwards `weight` and `eps`.
10
  """
 
25
  ) -> torch.Tensor:
26
  """
27
  Convert the sample and its denoised version to velocity.
 
28
  Returns:
29
  Velocity
30
  """
 
43
  ) -> torch.Tensor:
44
  """
45
  Convert the sample and its denoising velocity to denoised sample.
 
46
  Returns:
47
  Denoised sample
48
  """