Update pipeline.py

#2
by linoyts HF Staff - opened
Files changed (1) hide show
  1. pipeline.py +513 -532
pipeline.py CHANGED
@@ -12,50 +12,35 @@
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
 
15
- """
16
- LTX-2 Audio-to-Video Pipeline with Video Conditioning Support
17
-
18
- This is a modified version of the LTX2AudioToVideoPipeline that adds support for
19
- video conditioning, enabling avatar/face-swap generation workflows.
20
-
21
- Usage:
22
- pipe = DiffusionPipeline.from_pretrained(
23
- "rootonchair/LTX-2-19b-distilled",
24
- custom_pipeline="path/to/this/file",
25
- torch_dtype=torch.bfloat16
26
- )
27
-
28
- # With video conditioning (for avatar/face-swap):
29
- video, audio = pipe(
30
- image=face_image, # The face/appearance to use
31
- video=reference_video, # Video for motion conditioning
32
- audio="path/to/audio.wav", # Audio (or extracted from video)
33
- prompt="head_swap, a person speaking...",
34
- ...
35
- )
36
- """
37
-
38
  import copy
39
  import inspect
 
40
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
41
 
42
  import numpy as np
 
43
  import torch
44
  import torchaudio
45
  import torchaudio.transforms as T
46
- from PIL import Image
47
  from transformers import Gemma3ForConditionalGeneration, GemmaTokenizer, GemmaTokenizerFast
48
 
49
  from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
50
- from diffusers.image_processor import PipelineImageInput
51
  from diffusers.loaders import FromSingleFileMixin, LTXVideoLoraLoaderMixin
52
- from diffusers.models.autoencoders import AutoencoderKLLTX2Audio, AutoencoderKLLTX2Video
 
 
 
53
  from diffusers.models.transformers import LTX2VideoTransformer3DModel
54
  from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
55
- from diffusers.utils import is_torch_xla_available, logging, replace_example_docstring
 
 
 
 
56
  from diffusers.utils.torch_utils import randn_tensor
57
  from diffusers.video_processor import VideoProcessor
58
  from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 
59
  from diffusers.pipelines.ltx2.connectors import LTX2TextConnectors
60
  from diffusers.pipelines.ltx2.pipeline_output import LTX2PipelineOutput
61
  from diffusers.pipelines.ltx2.vocoder import LTX2Vocoder
@@ -63,51 +48,86 @@ from diffusers.pipelines.ltx2.vocoder import LTX2Vocoder
63
 
64
  if is_torch_xla_available():
65
  import torch_xla.core.xla_model as xm
 
66
  XLA_AVAILABLE = True
67
  else:
68
  XLA_AVAILABLE = False
69
 
70
- logger = logging.get_logger(__name__)
71
-
72
 
73
  EXAMPLE_DOC_STRING = """
74
  Examples:
75
  ```py
76
  >>> import torch
77
- >>> from diffusers import DiffusionPipeline
 
 
78
  >>> from diffusers.utils import load_image
79
 
80
- >>> pipe = DiffusionPipeline.from_pretrained(
81
- ... "rootonchair/LTX-2-19b-distilled",
82
- ... custom_pipeline="pipeline_ltx2_avatar",
83
- ... torch_dtype=torch.bfloat16
 
84
  ... )
85
- >>> pipe.to("cuda")
86
-
87
- >>> # Load face swap LoRA
88
- >>> pipe.load_lora_weights(
89
- ... "Alissonerdx/BFS-Best-Face-Swap-Video",
90
- ... weight_name="ltx-2/head_swap_v1_13500_first_frame.safetensors",
91
  ... )
92
- >>> pipe.fuse_lora(lora_scale=1.1)
93
-
94
- >>> face_image = load_image("face.png")
95
- >>> video, audio = pipe(
96
- ... image=face_image,
97
- ... video="reference_video.mp4", # Motion reference
98
- ... video_conditioning_strength=1.0, # How strongly to follow motion
99
- ... video_conditioning_frame_idx=1, # Frame 0 = face, Frame 1+ = video motion
100
- ... audio="reference_video.mp4", # Audio extracted from video
101
- ... prompt="head_swap, a person speaking naturally",
102
- ... width=512,
103
- ... height=768,
 
104
  ... num_frames=121,
 
 
 
 
105
  ... return_dict=False,
106
  ... )
 
 
 
 
 
 
 
 
 
 
107
  ```
108
  """
109
 
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  def retrieve_latents(
112
  encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
113
  ):
@@ -121,6 +141,7 @@ def retrieve_latents(
121
  raise AttributeError("Could not access latents of provided encoder_output")
122
 
123
 
 
124
  def calculate_shift(
125
  image_seq_len,
126
  base_seq_len: int = 256,
@@ -134,6 +155,7 @@ def calculate_shift(
134
  return mu
135
 
136
 
 
137
  def retrieve_timesteps(
138
  scheduler,
139
  num_inference_steps: Optional[int] = None,
@@ -142,13 +164,37 @@ def retrieve_timesteps(
142
  sigmas: Optional[List[float]] = None,
143
  **kwargs,
144
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  if timesteps is not None and sigmas is not None:
146
- raise ValueError("Only one of `timesteps` or `sigmas` can be passed.")
147
  if timesteps is not None:
148
  accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
149
  if not accepts_timesteps:
150
  raise ValueError(
151
- f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom timestep schedules."
 
152
  )
153
  scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
154
  timesteps = scheduler.timesteps
@@ -157,7 +203,8 @@ def retrieve_timesteps(
157
  accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
158
  if not accept_sigmas:
159
  raise ValueError(
160
- f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom sigmas schedules."
 
161
  )
162
  scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
163
  timesteps = scheduler.timesteps
@@ -168,7 +215,24 @@ def retrieve_timesteps(
168
  return timesteps, num_inference_steps
169
 
170
 
 
171
  def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
173
  std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
174
  noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
@@ -176,17 +240,13 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
176
  return noise_cfg
177
 
178
 
179
- class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
180
  r"""
181
- Pipeline for avatar/face-swap video generation with audio and video conditioning.
182
 
183
- This pipeline generates video conditioned on:
184
- - An input image (the face/appearance to use)
185
- - A reference video (for motion/pose conditioning)
186
- - Input audio (for lip-sync)
187
 
188
- This enables avatar generation where the face from the image is animated
189
- to match the motion from the reference video and synced to the audio.
190
  """
191
 
192
  model_cpu_offload_seq = "text_encoder->connectors->transformer->vae->audio_vae->vocoder"
@@ -223,6 +283,7 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
223
  self.vae_temporal_compression_ratio = (
224
  self.vae.temporal_compression_ratio if getattr(self, "vae", None) is not None else 8
225
  )
 
226
  self.audio_vae_mel_compression_ratio = (
227
  self.audio_vae.mel_compression_ratio if getattr(self, "audio_vae", None) is not None else 4
228
  )
@@ -248,123 +309,8 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
248
  self.tokenizer.model_max_length if getattr(self, "tokenizer", None) is not None else 1024
249
  )
250
 
251
- # ==================== Video Conditioning Methods ====================
252
-
253
- def _load_video_frames(
254
- self,
255
- video: Union[str, List[Image.Image], torch.Tensor],
256
- height: int,
257
- width: int,
258
- num_frames: int,
259
- device: torch.device,
260
- dtype: torch.dtype,
261
- ) -> torch.Tensor:
262
- """
263
- Load and preprocess video frames for conditioning.
264
-
265
- Args:
266
- video: Path to video file, list of PIL images, or tensor of frames
267
- height: Target height
268
- width: Target width
269
- num_frames: Number of frames to extract/use
270
- device: Target device
271
- dtype: Target dtype
272
-
273
- Returns:
274
- Tensor of shape (batch, channels, num_frames, height, width)
275
- """
276
- if isinstance(video, str):
277
- # Load from file
278
- frames = self._decode_video_file(video, num_frames)
279
- elif isinstance(video, list):
280
- # List of PIL images
281
- frames = [np.array(img.convert("RGB")) for img in video]
282
- elif isinstance(video, torch.Tensor):
283
- # Already a tensor
284
- if video.ndim == 4: # (F, H, W, C) or (F, C, H, W)
285
- if video.shape[-1] in [1, 3, 4]: # (F, H, W, C)
286
- frames = [video[i].cpu().numpy() for i in range(video.shape[0])]
287
- else: # (F, C, H, W)
288
- frames = [video[i].permute(1, 2, 0).cpu().numpy() for i in range(video.shape[0])]
289
- else:
290
- raise ValueError(f"Unexpected video tensor shape: {video.shape}")
291
- else:
292
- raise TypeError(f"Unsupported video type: {type(video)}")
293
-
294
- # Handle frame count
295
- if len(frames) >= num_frames:
296
- frames = frames[:num_frames]
297
- else:
298
- # Pad with last frame
299
- last_frame = frames[-1]
300
- while len(frames) < num_frames:
301
- frames.append(last_frame)
302
-
303
- # Process each frame
304
- processed_frames = []
305
- for frame in frames:
306
- if isinstance(frame, np.ndarray):
307
- frame = Image.fromarray(frame.astype(np.uint8))
308
-
309
- # Resize to target dimensions
310
- frame = frame.resize((width, height), Image.LANCZOS)
311
- frame = np.array(frame)
312
-
313
- # Normalize to [-1, 1]
314
- frame = (frame.astype(np.float32) / 127.5) - 1.0
315
- processed_frames.append(frame)
316
-
317
- # Stack frames: (F, H, W, C) -> (1, C, F, H, W)
318
- frames_array = np.stack(processed_frames, axis=0) # (F, H, W, C)
319
- frames_tensor = torch.from_numpy(frames_array).permute(3, 0, 1, 2).unsqueeze(0) # (1, C, F, H, W)
320
-
321
- return frames_tensor.to(device=device, dtype=dtype)
322
-
323
- def _decode_video_file(self, video_path: str, max_frames: int) -> List[np.ndarray]:
324
- """Decode video file to list of numpy arrays."""
325
- try:
326
- import av
327
- except ImportError:
328
- raise ImportError("Please install av: pip install av")
329
-
330
- frames = []
331
- container = av.open(video_path)
332
- try:
333
- video_stream = next(s for s in container.streams if s.type == "video")
334
- for frame in container.decode(video_stream):
335
- frames.append(frame.to_rgb().to_ndarray())
336
- if len(frames) >= max_frames:
337
- break
338
- finally:
339
- container.close()
340
-
341
- return frames
342
-
343
- def _encode_video_conditioning(
344
- self,
345
- video: torch.Tensor,
346
- generator: Optional[torch.Generator] = None,
347
- ) -> torch.Tensor:
348
- """
349
- Encode video frames through the VAE to get latents.
350
-
351
- Args:
352
- video: Video tensor of shape (batch, channels, frames, height, width)
353
- generator: Random generator for sampling
354
-
355
- Returns:
356
- Video latents
357
- """
358
- # Encode each frame through VAE
359
- # VAE expects (batch, channels, frames, height, width)
360
- video = video.to(device=self.vae.device, dtype=self.vae.dtype).contiguous()
361
- latents = retrieve_latents(self.vae.encode(video), generator, "argmax")
362
-
363
- return latents
364
-
365
- # ==================== Text Encoding Methods ====================
366
-
367
  @staticmethod
 
368
  def _pack_text_embeds(
369
  text_hidden_states: torch.Tensor,
370
  sequence_lengths: torch.Tensor,
@@ -402,6 +348,7 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
402
  normalized_hidden_states = normalized_hidden_states.to(dtype=original_dtype)
403
  return normalized_hidden_states
404
 
 
405
  def _get_gemma_prompt_embeds(
406
  self,
407
  prompt: Union[str, List[str]],
@@ -461,6 +408,7 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
461
 
462
  return prompt_embeds, prompt_attention_mask
463
 
 
464
  def encode_prompt(
465
  self,
466
  prompt: Union[str, List[str]],
@@ -500,11 +448,14 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
500
 
501
  if prompt is not None and type(prompt) is not type(negative_prompt):
502
  raise TypeError(
503
- f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} != {type(prompt)}."
 
504
  )
505
  elif batch_size != len(negative_prompt):
506
  raise ValueError(
507
- f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`: {prompt} has batch size {batch_size}."
 
 
508
  )
509
 
510
  negative_prompt_embeds, negative_prompt_attention_mask = self._get_gemma_prompt_embeds(
@@ -536,13 +487,18 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
536
  k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
537
  ):
538
  raise ValueError(
539
- f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}"
540
  )
541
 
542
  if prompt is not None and prompt_embeds is not None:
543
- raise ValueError("Cannot forward both `prompt` and `prompt_embeds`.")
 
 
 
544
  elif prompt is None and prompt_embeds is None:
545
- raise ValueError("Provide either `prompt` or `prompt_embeds`.")
 
 
546
  elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
547
  raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
548
 
@@ -552,9 +508,22 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
552
  if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
553
  raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
554
 
555
- # ==================== Latent Packing/Unpacking ====================
556
-
 
 
 
 
 
 
 
 
 
 
 
 
557
  @staticmethod
 
558
  def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int = 1) -> torch.Tensor:
559
  batch_size, num_channels, num_frames, height, width = latents.shape
560
  post_patch_num_frames = num_frames // patch_size_t
@@ -574,6 +543,7 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
574
  return latents
575
 
576
  @staticmethod
 
577
  def _unpack_latents(
578
  latents: torch.Tensor, num_frames: int, height: int, width: int, patch_size: int = 1, patch_size_t: int = 1
579
  ) -> torch.Tensor:
@@ -592,6 +562,7 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
592
  return latents
593
 
594
  @staticmethod
 
595
  def _denormalize_latents(
596
  latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0
597
  ) -> torch.Tensor:
@@ -600,9 +571,17 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
600
  latents = latents * latents_std / scaling_factor + latents_mean
601
  return latents
602
 
603
- # ==================== Audio Latent Methods ====================
604
-
605
  @staticmethod
 
 
 
 
 
 
 
 
 
 
606
  def _pack_audio_latents(
607
  latents: torch.Tensor, patch_size: Optional[int] = None, patch_size_t: Optional[int] = None
608
  ) -> torch.Tensor:
@@ -619,6 +598,7 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
619
  return latents
620
 
621
  @staticmethod
 
622
  def _unpack_audio_latents(
623
  latents: torch.Tensor,
624
  latent_length: int,
@@ -635,29 +615,191 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
635
  return latents
636
 
637
  @staticmethod
638
- def _denormalize_audio_latents(latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor):
 
639
  latents_mean = latents_mean.to(latents.device, latents.dtype)
640
  latents_std = latents_std.to(latents.device, latents.dtype)
641
- return (latents * latents_std) + latents_mean
642
 
643
  @staticmethod
644
- def _normalize_audio_latents(latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor):
 
645
  latents_mean = latents_mean.to(latents.device, latents.dtype)
646
  latents_std = latents_std.to(latents.device, latents.dtype)
647
- return (latents - latents_mean) / latents_std
648
 
649
- @staticmethod
650
- def _patchify_audio_latents(latents: torch.Tensor) -> torch.Tensor:
651
- batch, channels, time, freq = latents.shape
652
- return latents.permute(0, 2, 1, 3).reshape(batch, time, channels * freq)
 
 
653
 
654
- @staticmethod
655
- def _unpatchify_audio_latents(latents: torch.Tensor, channels: int, freq: int) -> torch.Tensor:
656
- batch, time, _ = latents.shape
657
- return latents.reshape(batch, time, channels, freq).permute(0, 2, 1, 3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
658
 
659
  def _preprocess_audio(self, audio: Union[str, torch.Tensor], target_sample_rate: int) -> torch.Tensor:
660
- """Process audio to mel spectrogram."""
661
  if isinstance(audio, str):
662
  waveform, sr = torchaudio.load(audio)
663
  else:
@@ -667,12 +809,14 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
667
  if sr != target_sample_rate:
668
  waveform = torchaudio.functional.resample(waveform, sr, target_sample_rate)
669
 
 
 
670
  if waveform.shape[0] == 1:
671
  waveform = waveform.repeat(2, 1)
672
  elif waveform.shape[0] > 2:
673
  waveform = waveform[:2, :]
674
 
675
- waveform = waveform.unsqueeze(0)
676
 
677
  n_fft = 1024
678
  mel_transform = T.MelSpectrogram(
@@ -691,208 +835,86 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
691
  norm="slaney",
692
  )
693
 
694
- mel_spec = mel_transform(waveform)
695
  mel_spec = torch.log(torch.clamp(mel_spec, min=1e-5))
696
- mel_spec = mel_spec.permute(0, 1, 3, 2).contiguous()
697
-
698
  return mel_spec
699
 
700
- # ==================== Latent Preparation ====================
701
-
702
- def prepare_latents(
703
- self,
704
- image: Optional[torch.Tensor] = None,
705
- video: Optional[torch.Tensor] = None,
706
- video_conditioning_strength: float = 1.0,
707
- video_conditioning_frame_idx: int = 1,
708
- batch_size: int = 1,
709
- num_channels_latents: int = 128,
710
- height: int = 512,
711
- width: int = 704,
712
- num_frames: int = 161,
713
- dtype: Optional[torch.dtype] = None,
714
- device: Optional[torch.device] = None,
715
- generator: Optional[torch.Generator] = None,
716
- latents: Optional[torch.Tensor] = None,
717
- ) -> Tuple[torch.Tensor, torch.Tensor]:
718
- """
719
- Prepare latents for generation with optional video conditioning.
720
-
721
- Args:
722
- image: Input image for frame 0 conditioning
723
- video: Video tensor for motion conditioning
724
- video_conditioning_strength: Strength of video conditioning (0-1)
725
- video_conditioning_frame_idx: Frame index where video conditioning starts.
726
- - 0: Video conditioning replaces all frames including frame 0
727
- - 1: Frame 0 is image-conditioned, frames 1+ are video-conditioned (default for face-swap)
728
- - N: Frames 0 to N-1 are image/noise, frames N+ are video-conditioned
729
- ... other args ...
730
- """
731
- latent_height = height // self.vae_spatial_compression_ratio
732
- latent_width = width // self.vae_spatial_compression_ratio
733
- latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
734
-
735
- shape = (batch_size, num_channels_latents, latent_num_frames, latent_height, latent_width)
736
- mask_shape = (batch_size, 1, latent_num_frames, latent_height, latent_width)
737
-
738
- if latents is not None:
739
- conditioning_mask = latents.new_zeros(mask_shape)
740
- conditioning_mask[:, :, 0] = 1.0
741
- conditioning_mask = self._pack_latents(
742
- conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
743
- ).squeeze(-1)
744
- return latents.to(device=device, dtype=dtype), conditioning_mask
745
-
746
- # Initialize conditioning mask (all zeros = fully denoise)
747
- conditioning_mask = torch.zeros(mask_shape, device=device, dtype=dtype)
748
-
749
- # Initialize latents tensor
750
- init_latents = torch.zeros(shape, device=device, dtype=dtype)
751
-
752
- # Case 1: Video conditioning (motion from reference video)
753
- if video is not None:
754
- # Encode video through VAE
755
- video_latents = self._encode_video_conditioning(video, generator)
756
- video_latents = self._normalize_latents(video_latents, self.vae.latents_mean, self.vae.latents_std)
757
-
758
- # Ensure video latents match target shape
759
- if video_latents.shape[2] < latent_num_frames:
760
- # Pad with last frame
761
- pad_frames = latent_num_frames - video_latents.shape[2]
762
- last_frame = video_latents[:, :, -1:, :, :]
763
- video_latents = torch.cat([video_latents, last_frame.repeat(1, 1, pad_frames, 1, 1)], dim=2)
764
- elif video_latents.shape[2] > latent_num_frames:
765
- video_latents = video_latents[:, :, :latent_num_frames, :, :]
766
-
767
- # Calculate the latent frame index for video conditioning
768
- # video_conditioning_frame_idx is in pixel space, convert to latent space
769
- latent_video_start_idx = video_conditioning_frame_idx // self.vae_temporal_compression_ratio
770
- latent_video_start_idx = min(latent_video_start_idx, latent_num_frames - 1)
771
-
772
- # Apply video conditioning starting from the specified frame index
773
- # Video frames are placed starting at latent_video_start_idx
774
- num_video_frames_to_use = latent_num_frames - latent_video_start_idx
775
- init_latents[:, :, latent_video_start_idx:, :, :] = video_latents[:, :, :num_video_frames_to_use, :, :]
776
-
777
- # Set conditioning mask for video frames
778
- # strength=1.0 means fully conditioned (no denoising), strength=0.0 means fully denoised
779
- conditioning_mask[:, :, latent_video_start_idx:] = video_conditioning_strength
780
-
781
- # Handle image conditioning for frame 0
782
- if image is not None:
783
- if isinstance(generator, list):
784
- image_latents = [
785
- retrieve_latents(
786
- self.vae.encode(
787
- image[i].unsqueeze(0).unsqueeze(2)
788
- .to(device=self.vae.device, dtype=self.vae.dtype)
789
- .contiguous()
790
- ),
791
- generator[i],
792
- "argmax",
793
- )
794
- for i in range(batch_size)
795
- ]
796
-
797
- else:
798
- image_latents = [
799
- retrieve_latents(self.vae.encode(img.unsqueeze(0).unsqueeze(2).to(device=self.vae.device, dtype=self.vae.dtype).contiguous()), generator, "argmax")
800
-
801
- for img in image
802
- ]
803
- image_latents = torch.cat(image_latents, dim=0).to(dtype)
804
- image_latents = self._normalize_latents(image_latents, self.vae.latents_mean, self.vae.latents_std)
805
-
806
- # Replace frame 0 with image latents (face appearance)
807
- init_latents[:, :, 0:1, :, :] = image_latents
808
- # Frame 0 is fully conditioned
809
- conditioning_mask[:, :, 0] = 1.0
810
-
811
- # If no video conditioning, repeat image for all frames (image-to-video mode)
812
- if video is None:
813
- init_latents = image_latents.repeat(1, 1, latent_num_frames, 1, 1)
814
-
815
- # Generate noise
816
- noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
817
-
818
- # Blend: conditioned regions keep init_latents, unconditioned regions get noise
819
- latents = init_latents * conditioning_mask + noise * (1 - conditioning_mask)
820
-
821
- # Pack for transformer
822
- conditioning_mask = self._pack_latents(
823
- conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
824
- ).squeeze(-1)
825
- latents = self._pack_latents(
826
- latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
827
- )
828
-
829
- return latents, conditioning_mask
830
-
831
  def prepare_audio_latents(
832
  self,
833
  batch_size: int = 1,
834
  num_channels_latents: int = 8,
 
835
  num_mel_bins: int = 64,
836
- num_frames: int = 121,
837
- frame_rate: float = 25.0,
838
- sampling_rate: int = 16000,
839
- hop_length: int = 160,
840
  dtype: Optional[torch.dtype] = None,
841
  device: Optional[torch.device] = None,
842
  generator: Optional[torch.Generator] = None,
843
- audio_input: Optional[Union[str, torch.Tensor]] = None,
844
  latents: Optional[torch.Tensor] = None,
 
845
  ) -> Tuple[torch.Tensor, int, Optional[torch.Tensor]]:
846
- duration_s = num_frames / frame_rate
847
- latents_per_second = (
848
- float(sampling_rate) / float(hop_length) / float(self.audio_vae_temporal_compression_ratio)
849
- )
850
- target_length = round(duration_s * latents_per_second)
851
-
852
  if latents is not None:
853
- return latents.to(device=device, dtype=dtype), target_length, None
854
-
855
- latent_mel_bins = num_mel_bins // self.audio_vae_mel_compression_ratio
 
 
 
 
 
 
856
 
 
857
  if audio_input is not None:
858
- mel_spec = self._preprocess_audio(audio_input, sampling_rate).to(device=device)
859
  mel_spec = mel_spec.to(dtype=self.audio_vae.dtype)
860
- init_latents = self.audio_vae.encode(mel_spec).latent_dist.sample(generator)
861
- init_latents = init_latents.to(dtype=dtype)
862
-
863
- latent_channels = init_latents.shape[1]
864
- latent_freq = init_latents.shape[3]
865
- init_latents_patched = self._patchify_audio_latents(init_latents)
866
- init_latents_patched = self._normalize_audio_latents(
867
- init_latents_patched, self.audio_vae.latents_mean, self.audio_vae.latents_std
868
- )
869
- init_latents = self._unpatchify_audio_latents(init_latents_patched, latent_channels, latent_freq)
870
 
871
- current_len = init_latents.shape[2]
872
- if current_len < target_length:
873
- padding = target_length - current_len
874
- init_latents = torch.nn.functional.pad(init_latents, (0, 0, 0, padding))
875
- elif current_len > target_length:
876
- init_latents = init_latents[:, :, :target_length, :]
877
 
878
- noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype)
 
 
 
 
 
 
879
 
880
- if init_latents.shape[0] != batch_size:
881
- init_latents = init_latents.repeat(batch_size, 1, 1, 1)
882
- noise = noise.repeat(batch_size, 1, 1, 1)
883
 
884
- packed_noise = self._pack_audio_latents(noise)
 
 
 
 
 
 
 
885
 
886
- return packed_noise, target_length, init_latents
 
 
 
 
 
 
 
 
 
887
 
888
- shape = (batch_size, num_channels_latents, target_length, latent_mel_bins)
889
  latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
890
  latents = self._pack_audio_latents(latents)
 
891
 
892
- return latents, target_length, None
893
 
894
- # ==================== Properties ====================
895
-
896
  @property
897
  def guidance_scale(self):
898
  return self._guidance_scale
@@ -921,37 +943,25 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
921
  def interrupt(self):
922
  return self._interrupt
923
 
924
- def _get_audio_duration(self, audio: Union[str, torch.Tensor], sample_rate: int) -> float:
925
- if isinstance(audio, str):
926
- info = torchaudio.info(audio)
927
- return info.num_frames / info.sample_rate
928
- else:
929
- num_samples = audio.shape[-1]
930
- return num_samples / sample_rate
931
-
932
- # ==================== Main Call ====================
933
-
934
  @torch.no_grad()
935
  @replace_example_docstring(EXAMPLE_DOC_STRING)
936
  def __call__(
937
  self,
938
- image: PipelineImageInput = None,
939
- video: Optional[Union[str, List[Image.Image], torch.Tensor]] = None,
940
- video_conditioning_strength: float = 1.0,
941
- video_conditioning_frame_idx: int = 1,
942
  audio: Optional[Union[str, torch.Tensor]] = None,
943
  prompt: Union[str, List[str]] = None,
944
  negative_prompt: Optional[Union[str, List[str]]] = None,
945
  height: int = 512,
946
  width: int = 768,
947
- num_frames: Optional[int] = None,
948
  max_frames: int = 257,
949
  frame_rate: float = 24.0,
950
  num_inference_steps: int = 40,
951
- timesteps: List[int] = None,
952
  sigmas: Optional[List[float]] = None,
 
953
  guidance_scale: float = 4.0,
954
  guidance_rescale: float = 0.0,
 
955
  num_videos_per_prompt: Optional[int] = 1,
956
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
957
  latents: Optional[torch.Tensor] = None,
@@ -969,50 +979,20 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
969
  callback_on_step_end_tensor_inputs: List[str] = ["latents"],
970
  max_sequence_length: int = 1024,
971
  ):
972
- r"""
973
- Generate avatar video with audio and optional video conditioning.
974
-
975
- Args:
976
- image (`PipelineImageInput`):
977
- The input image (face/appearance) to condition frame 0.
978
- video (`str`, `List[PIL.Image]`, or `torch.Tensor`, *optional*):
979
- Reference video for motion conditioning. Can be:
980
- - Path to a video file
981
- - List of PIL Images
982
- - Tensor of shape (F, H, W, C) or (F, C, H, W)
983
- video_conditioning_strength (`float`, *optional*, defaults to 1.0):
984
- How strongly to condition on the reference video (0.0-1.0).
985
- 1.0 = fully conditioned, 0.0 = no conditioning.
986
- video_conditioning_frame_idx (`int`, *optional*, defaults to 1):
987
- Frame index where video conditioning starts (in pixel/frame space).
988
- - 0: Video conditioning replaces all frames including frame 0
989
- - 1: Frame 0 is image-conditioned, frames 1+ are video-conditioned (default for face-swap)
990
- - N: Frames 0 to N-1 are image/noise, frames N+ are video-conditioned
991
- audio (`str` or `torch.Tensor`, *optional*):
992
- Audio for lip-sync. Can be path to audio/video file or waveform tensor.
993
- prompt (`str` or `List[str]`, *optional*):
994
- Text prompt. For face-swap, include "head_swap" trigger.
995
- Examples:
996
-
997
- Returns:
998
- [`LTX2PipelineOutput`] or `tuple`: Generated video and audio.
999
- """
1000
-
1001
  if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
1002
  callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
1003
 
1004
- # Calculate num_frames from audio duration if not provided
1005
- if num_frames is None:
1006
- if audio is not None:
1007
- audio_duration = self._get_audio_duration(audio, self.audio_sampling_rate)
1008
- calculated_frames = int(audio_duration * frame_rate) + 1
1009
- num_frames = min(calculated_frames, max_frames)
1010
- num_frames = ((num_frames - 1) // self.vae_temporal_compression_ratio) * self.vae_temporal_compression_ratio + 1
1011
- num_frames = max(num_frames, 9)
1012
- logger.info(f"Audio duration: {audio_duration:.2f}s -> num_frames: {num_frames}")
1013
- else:
1014
- num_frames = 121
1015
-
1016
  self.check_inputs(
1017
  prompt=prompt,
1018
  height=height,
@@ -1030,6 +1010,7 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1030
  self._interrupt = False
1031
  self._current_timestep = None
1032
 
 
1033
  if prompt is not None and isinstance(prompt, str):
1034
  batch_size = 1
1035
  elif prompt is not None and isinstance(prompt, list):
@@ -1037,9 +1018,16 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1037
  else:
1038
  batch_size = prompt_embeds.shape[0]
1039
 
 
 
 
 
 
 
 
1040
  device = self._execution_device
1041
 
1042
- # Encode prompts
1043
  (
1044
  prompt_embeds,
1045
  prompt_attention_mask,
@@ -1066,48 +1054,67 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1066
  prompt_embeds, additive_attention_mask, additive_mask=True
1067
  )
1068
 
1069
- # Preprocess image
1070
- if latents is None and image is not None:
1071
- image = self.video_processor.preprocess(image, height=height, width=width)
1072
- image = image.to(device=device, dtype=self.vae.dtype)
1073
-
1074
-
1075
- # Preprocess video conditioning
1076
- video_tensor = None
1077
- if video is not None:
1078
- video_tensor = self._load_video_frames(
1079
- video=video,
1080
- height=height,
1081
- width=width,
1082
- num_frames=num_frames,
1083
- device=device,
1084
- dtype=self.vae.dtype,
1085
- )
 
 
 
1086
 
1087
- # Prepare latents with video conditioning
1088
  num_channels_latents = self.transformer.config.in_channels
1089
- latents, conditioning_mask = self.prepare_latents(
1090
- image=image,
1091
- video=video_tensor,
1092
- video_conditioning_strength=video_conditioning_strength,
1093
- video_conditioning_frame_idx=video_conditioning_frame_idx,
1094
- batch_size=batch_size * num_videos_per_prompt,
1095
- num_channels_latents=num_channels_latents,
1096
- height=height,
1097
- width=width,
1098
- num_frames=num_frames,
1099
- dtype=torch.float32,
1100
- device=device,
1101
- generator=generator,
1102
- latents=latents,
1103
  )
1104
  if self.do_classifier_free_guidance:
1105
  conditioning_mask = torch.cat([conditioning_mask, conditioning_mask])
1106
 
1107
- # Prepare audio latents
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1108
  num_mel_bins = self.audio_vae.config.mel_bins if getattr(self, "audio_vae", None) is not None else 64
1109
  latent_mel_bins = num_mel_bins // self.audio_vae_mel_compression_ratio
1110
-
1111
  num_channels_latents_audio = (
1112
  self.audio_vae.config.latent_channels if getattr(self, "audio_vae", None) is not None else 8
1113
  )
@@ -1115,30 +1122,20 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1115
  audio_latents, audio_num_frames, clean_audio_latents = self.prepare_audio_latents(
1116
  batch_size * num_videos_per_prompt,
1117
  num_channels_latents=num_channels_latents_audio,
 
1118
  num_mel_bins=num_mel_bins,
1119
- num_frames=num_frames,
1120
- frame_rate=frame_rate,
1121
- sampling_rate=self.audio_sampling_rate,
1122
- hop_length=self.audio_hop_length,
1123
  dtype=torch.float32,
1124
  device=device,
1125
  generator=generator,
1126
  latents=audio_latents,
1127
  audio_input=audio,
1128
  )
 
 
1129
 
1130
- packed_clean_audio_latents = None
1131
- if clean_audio_latents is not None:
1132
- packed_clean_audio_latents = self._pack_audio_latents(clean_audio_latents)
1133
-
1134
- latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
1135
- latent_height = height // self.vae_spatial_compression_ratio
1136
- latent_width = width // self.vae_spatial_compression_ratio
1137
- video_sequence_length = latent_num_frames * latent_height * latent_width
1138
-
1139
- if sigmas is None:
1140
- sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
1141
-
1142
  mu = calculate_shift(
1143
  video_sequence_length,
1144
  self.scheduler.config.get("base_image_seq_len", 1024),
@@ -1179,7 +1176,7 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1179
  audio_latents.shape[0], audio_num_frames, audio_latents.device
1180
  )
1181
 
1182
- # Denoising loop
1183
  with self.progress_bar(total=num_inference_steps) as progress_bar:
1184
  for i, t in enumerate(timesteps):
1185
  if self.interrupt:
@@ -1187,6 +1184,7 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1187
 
1188
  self._current_timestep = t
1189
 
 
1190
  if packed_clean_audio_latents is not None:
1191
  audio_latents_input = packed_clean_audio_latents.to(dtype=prompt_embeds.dtype)
1192
  else:
@@ -1200,7 +1198,7 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1200
  audio_latent_model_input = audio_latent_model_input.to(prompt_embeds.dtype)
1201
 
1202
  timestep = t.expand(latent_model_input.shape[0])
1203
- video_timestep = timestep.unsqueeze(-1) * (1 - conditioning_mask)
1204
 
1205
  if packed_clean_audio_latents is not None:
1206
  audio_timestep = torch.zeros_like(timestep)
@@ -1224,6 +1222,7 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1224
  audio_num_frames=audio_num_frames,
1225
  video_coords=video_coords,
1226
  audio_coords=audio_coords,
 
1227
  attention_kwargs=attention_kwargs,
1228
  return_dict=False,
1229
  )
@@ -1249,32 +1248,17 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1249
  noise_pred_audio, noise_pred_audio_text, guidance_rescale=self.guidance_rescale
1250
  )
1251
 
1252
- noise_pred_video = self._unpack_latents(
1253
- noise_pred_video,
1254
- latent_num_frames,
1255
- latent_height,
1256
- latent_width,
1257
- self.transformer_spatial_patch_size,
1258
- self.transformer_temporal_patch_size,
1259
- )
1260
- latents = self._unpack_latents(
1261
- latents,
1262
- latent_num_frames,
1263
- latent_height,
1264
- latent_width,
1265
- self.transformer_spatial_patch_size,
1266
- self.transformer_temporal_patch_size,
1267
- )
1268
-
1269
- noise_pred_video = noise_pred_video[:, :, 1:]
1270
- noise_latents = latents[:, :, 1:]
1271
- pred_latents = self.scheduler.step(noise_pred_video, t, noise_latents, return_dict=False)[0]
1272
 
1273
- latents = torch.cat([latents[:, :, :1], pred_latents], dim=2)
1274
- latents = self._pack_latents(
1275
- latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
1276
- )
1277
 
 
1278
  if packed_clean_audio_latents is None:
1279
  audio_latents = audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0]
1280
 
@@ -1283,6 +1267,7 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1283
  for k in callback_on_step_end_tensor_inputs:
1284
  callback_kwargs[k] = locals()[k]
1285
  callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
 
1286
  latents = callback_outputs.pop("latents", latents)
1287
  prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1288
 
@@ -1292,7 +1277,6 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1292
  if XLA_AVAILABLE:
1293
  xm.mark_step()
1294
 
1295
- # Decode
1296
  latents = self._unpack_latents(
1297
  latents,
1298
  latent_num_frames,
@@ -1305,25 +1289,22 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1305
  latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
1306
  )
1307
 
1308
- if clean_audio_latents is not None:
1309
- latent_channels = clean_audio_latents.shape[1]
1310
- latent_freq = clean_audio_latents.shape[3]
1311
- audio_patched = self._patchify_audio_latents(clean_audio_latents)
1312
- audio_patched = self._denormalize_audio_latents(
1313
- audio_patched, self.audio_vae.latents_mean, self.audio_vae.latents_std
1314
- )
1315
- audio_latents_for_decode = self._unpatchify_audio_latents(audio_patched, latent_channels, latent_freq)
1316
  else:
1317
- audio_latents_for_decode = self._denormalize_audio_latents(
1318
- audio_latents, self.audio_vae.latents_mean, self.audio_vae.latents_std
1319
- )
1320
- audio_latents_for_decode = self._unpack_audio_latents(
1321
- audio_latents_for_decode, audio_num_frames, num_mel_bins=latent_mel_bins
1322
- )
 
 
1323
 
1324
  if output_type == "latent":
1325
  video = latents
1326
- audio_output = audio_latents_for_decode
1327
  else:
1328
  latents = latents.to(prompt_embeds.dtype)
1329
 
@@ -1348,13 +1329,13 @@ class LTX2AvatarPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoa
1348
  video = self.vae.decode(latents, timestep, return_dict=False)[0]
1349
  video = self.video_processor.postprocess_video(video, output_type=output_type)
1350
 
1351
- audio_latents_for_decode = audio_latents_for_decode.to(self.audio_vae.dtype)
1352
- generated_mel_spectrograms = self.audio_vae.decode(audio_latents_for_decode, return_dict=False)[0]
1353
- audio_output = self.vocoder(generated_mel_spectrograms)
1354
 
1355
  self.maybe_free_model_hooks()
1356
 
1357
  if not return_dict:
1358
- return (video, audio_output)
1359
 
1360
- return LTX2PipelineOutput(frames=video, audio=audio_output)
 
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  import copy
16
  import inspect
17
+ from dataclasses import dataclass
18
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
19
 
20
  import numpy as np
21
+ import PIL.Image
22
  import torch
23
  import torchaudio
24
  import torchaudio.transforms as T
 
25
  from transformers import Gemma3ForConditionalGeneration, GemmaTokenizer, GemmaTokenizerFast
26
 
27
  from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
 
28
  from diffusers.loaders import FromSingleFileMixin, LTXVideoLoraLoaderMixin
29
+ from diffusers.models.autoencoders import (
30
+ AutoencoderKLLTX2Audio,
31
+ AutoencoderKLLTX2Video,
32
+ )
33
  from diffusers.models.transformers import LTX2VideoTransformer3DModel
34
  from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
35
+ from diffusers.utils import (
36
+ is_torch_xla_available,
37
+ logging,
38
+ replace_example_docstring,
39
+ )
40
  from diffusers.utils.torch_utils import randn_tensor
41
  from diffusers.video_processor import VideoProcessor
42
  from diffusers.pipelines.pipeline_utils import DiffusionPipeline
43
+
44
  from diffusers.pipelines.ltx2.connectors import LTX2TextConnectors
45
  from diffusers.pipelines.ltx2.pipeline_output import LTX2PipelineOutput
46
  from diffusers.pipelines.ltx2.vocoder import LTX2Vocoder
 
48
 
49
  if is_torch_xla_available():
50
  import torch_xla.core.xla_model as xm
51
+
52
  XLA_AVAILABLE = True
53
  else:
54
  XLA_AVAILABLE = False
55
 
56
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
 
57
 
58
  EXAMPLE_DOC_STRING = """
59
  Examples:
60
  ```py
61
  >>> import torch
62
+ >>> from diffusers import LTX2ConditionPipeline
63
+ >>> from diffusers.pipelines.ltx2.export_utils import encode_video
64
+ >>> from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
65
  >>> from diffusers.utils import load_image
66
 
67
+ >>> pipe = LTX2ConditionPipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
68
+ >>> pipe.enable_model_cpu_offload()
69
+
70
+ >>> first_image = load_image(
71
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png"
72
  ... )
73
+ >>> last_image = load_image(
74
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png"
 
 
 
 
75
  ... )
76
+ >>> first_cond = LTX2VideoCondition(frames=first_image, index=0, strength=1.0)
77
+ >>> last_cond = LTX2VideoCondition(frames=last_image, index=-1, strength=1.0)
78
+ >>> conditions = [first_cond, last_cond]
79
+ >>> prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings."
80
+ >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted, static"
81
+
82
+ >>> frame_rate = 24.0
83
+ >>> video = pipe(
84
+ ... conditions=conditions,
85
+ ... prompt=prompt,
86
+ ... negative_prompt=negative_prompt,
87
+ ... width=768,
88
+ ... height=512,
89
  ... num_frames=121,
90
+ ... frame_rate=frame_rate,
91
+ ... num_inference_steps=40,
92
+ ... guidance_scale=4.0,
93
+ ... output_type="np",
94
  ... return_dict=False,
95
  ... )
96
+ >>> video = (video * 255).round().astype("uint8")
97
+ >>> video = torch.from_numpy(video)
98
+
99
+ >>> encode_video(
100
+ ... video[0],
101
+ ... fps=frame_rate,
102
+ ... audio=audio[0].float().cpu(),
103
+ ... audio_sample_rate=pipe.vocoder.config.output_sampling_rate, # should be 24000
104
+ ... output_path="video.mp4",
105
+ ... )
106
  ```
107
  """
108
 
109
 
110
+ @dataclass
111
+ class LTX2VideoCondition:
112
+ """
113
+ Defines a single frame-conditioning item for LTX-2 Video - a single frame or a sequence of frames.
114
+
115
+ Attributes:
116
+ frames (`PIL.Image.Image` or `List[PIL.Image.Image]` or `np.ndarray` or `torch.Tensor`):
117
+ The image (or video) to condition the video on. Accepts any type that can be handled by
118
+ VideoProcessor.preprocess_video.
119
+ index (`int`, defaults to `0`):
120
+ The index at which the image or video will conditionally affect the video generation.
121
+ strength (`float`, defaults to `1.0`):
122
+ The strength of the conditioning effect. A value of `1.0` means the conditioning effect is fully applied.
123
+ """
124
+
125
+ frames: Union[PIL.Image.Image, List[PIL.Image.Image], np.ndarray, torch.Tensor]
126
+ index: int = 0
127
+ strength: float = 1.0
128
+
129
+
130
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
131
  def retrieve_latents(
132
  encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
133
  ):
 
141
  raise AttributeError("Could not access latents of provided encoder_output")
142
 
143
 
144
+ # Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
145
  def calculate_shift(
146
  image_seq_len,
147
  base_seq_len: int = 256,
 
155
  return mu
156
 
157
 
158
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
159
  def retrieve_timesteps(
160
  scheduler,
161
  num_inference_steps: Optional[int] = None,
 
164
  sigmas: Optional[List[float]] = None,
165
  **kwargs,
166
  ):
167
+ r"""
168
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
169
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
170
+
171
+ Args:
172
+ scheduler (`SchedulerMixin`):
173
+ The scheduler to get timesteps from.
174
+ num_inference_steps (`int`):
175
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
176
+ must be `None`.
177
+ device (`str` or `torch.device`, *optional*):
178
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
179
+ timesteps (`List[int]`, *optional*):
180
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
181
+ `num_inference_steps` and `sigmas` must be `None`.
182
+ sigmas (`List[float]`, *optional*):
183
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
184
+ `num_inference_steps` and `timesteps` must be `None`.
185
+
186
+ Returns:
187
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
188
+ second element is the number of inference steps.
189
+ """
190
  if timesteps is not None and sigmas is not None:
191
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
192
  if timesteps is not None:
193
  accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
194
  if not accepts_timesteps:
195
  raise ValueError(
196
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
197
+ f" timestep schedules. Please check whether you are using the correct scheduler."
198
  )
199
  scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
200
  timesteps = scheduler.timesteps
 
203
  accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
204
  if not accept_sigmas:
205
  raise ValueError(
206
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
207
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
208
  )
209
  scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
210
  timesteps = scheduler.timesteps
 
215
  return timesteps, num_inference_steps
216
 
217
 
218
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
219
  def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
220
+ r"""
221
+ Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
222
+ Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
223
+ Flawed](https://huggingface.co/papers/2305.08891).
224
+
225
+ Args:
226
+ noise_cfg (`torch.Tensor`):
227
+ The predicted noise tensor for the guided diffusion process.
228
+ noise_pred_text (`torch.Tensor`):
229
+ The predicted noise tensor for the text-guided diffusion process.
230
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
231
+ A rescale factor applied to the noise predictions.
232
+
233
+ Returns:
234
+ noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
235
+ """
236
  std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
237
  std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
238
  noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
 
240
  return noise_cfg
241
 
242
 
243
+ class LTX2ConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixin):
244
  r"""
245
+ Pipeline for video generation which allows image conditions to be inserted at arbitary parts of the video.
246
 
247
+ Reference: https://github.com/Lightricks/LTX-Video
 
 
 
248
 
249
+ TODO
 
250
  """
251
 
252
  model_cpu_offload_seq = "text_encoder->connectors->transformer->vae->audio_vae->vocoder"
 
283
  self.vae_temporal_compression_ratio = (
284
  self.vae.temporal_compression_ratio if getattr(self, "vae", None) is not None else 8
285
  )
286
+ # TODO: check whether the MEL compression ratio logic here is corrct
287
  self.audio_vae_mel_compression_ratio = (
288
  self.audio_vae.mel_compression_ratio if getattr(self, "audio_vae", None) is not None else 4
289
  )
 
309
  self.tokenizer.model_max_length if getattr(self, "tokenizer", None) is not None else 1024
310
  )
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  @staticmethod
313
+ # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._pack_text_embeds
314
  def _pack_text_embeds(
315
  text_hidden_states: torch.Tensor,
316
  sequence_lengths: torch.Tensor,
 
348
  normalized_hidden_states = normalized_hidden_states.to(dtype=original_dtype)
349
  return normalized_hidden_states
350
 
351
+ # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._get_gemma_prompt_embeds
352
  def _get_gemma_prompt_embeds(
353
  self,
354
  prompt: Union[str, List[str]],
 
408
 
409
  return prompt_embeds, prompt_attention_mask
410
 
411
+ # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline.encode_prompt
412
  def encode_prompt(
413
  self,
414
  prompt: Union[str, List[str]],
 
448
 
449
  if prompt is not None and type(prompt) is not type(negative_prompt):
450
  raise TypeError(
451
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
452
+ f" {type(prompt)}."
453
  )
454
  elif batch_size != len(negative_prompt):
455
  raise ValueError(
456
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
457
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
458
+ " the batch size of `prompt`."
459
  )
460
 
461
  negative_prompt_embeds, negative_prompt_attention_mask = self._get_gemma_prompt_embeds(
 
487
  k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
488
  ):
489
  raise ValueError(
490
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
491
  )
492
 
493
  if prompt is not None and prompt_embeds is not None:
494
+ raise ValueError(
495
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
496
+ " only forward one of the two."
497
+ )
498
  elif prompt is None and prompt_embeds is None:
499
+ raise ValueError(
500
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
501
+ )
502
  elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
503
  raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
504
 
 
508
  if negative_prompt_embeds is not None and negative_prompt_attention_mask is None:
509
  raise ValueError("Must provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.")
510
 
511
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
512
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
513
+ raise ValueError(
514
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
515
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
516
+ f" {negative_prompt_embeds.shape}."
517
+ )
518
+ if prompt_attention_mask.shape != negative_prompt_attention_mask.shape:
519
+ raise ValueError(
520
+ "`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but"
521
+ f" got: `prompt_attention_mask` {prompt_attention_mask.shape} != `negative_prompt_attention_mask`"
522
+ f" {negative_prompt_attention_mask.shape}."
523
+ )
524
+
525
  @staticmethod
526
+ # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._pack_latents
527
  def _pack_latents(latents: torch.Tensor, patch_size: int = 1, patch_size_t: int = 1) -> torch.Tensor:
528
  batch_size, num_channels, num_frames, height, width = latents.shape
529
  post_patch_num_frames = num_frames // patch_size_t
 
543
  return latents
544
 
545
  @staticmethod
546
+ # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._unpack_latents
547
  def _unpack_latents(
548
  latents: torch.Tensor, num_frames: int, height: int, width: int, patch_size: int = 1, patch_size_t: int = 1
549
  ) -> torch.Tensor:
 
562
  return latents
563
 
564
  @staticmethod
565
+ # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._denormalize_latents
566
  def _denormalize_latents(
567
  latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor, scaling_factor: float = 1.0
568
  ) -> torch.Tensor:
 
571
  latents = latents * latents_std / scaling_factor + latents_mean
572
  return latents
573
 
 
 
574
  @staticmethod
575
+ # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._create_noised_state
576
+ def _create_noised_state(
577
+ latents: torch.Tensor, noise_scale: Union[float, torch.Tensor], generator: Optional[torch.Generator] = None
578
+ ):
579
+ noise = randn_tensor(latents.shape, generator=generator, device=latents.device, dtype=latents.dtype)
580
+ noised_latents = noise_scale * noise + (1 - noise_scale) * latents
581
+ return noised_latents
582
+
583
+ @staticmethod
584
+ # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._pack_audio_latents
585
  def _pack_audio_latents(
586
  latents: torch.Tensor, patch_size: Optional[int] = None, patch_size_t: Optional[int] = None
587
  ) -> torch.Tensor:
 
598
  return latents
599
 
600
  @staticmethod
601
+ # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._unpack_audio_latents
602
  def _unpack_audio_latents(
603
  latents: torch.Tensor,
604
  latent_length: int,
 
615
  return latents
616
 
617
  @staticmethod
618
+ # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._normalize_audio_latents
619
+ def _normalize_audio_latents(latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor):
620
  latents_mean = latents_mean.to(latents.device, latents.dtype)
621
  latents_std = latents_std.to(latents.device, latents.dtype)
622
+ return (latents - latents_mean) / latents_std
623
 
624
  @staticmethod
625
+ # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline._denormalize_audio_latents
626
+ def _denormalize_audio_latents(latents: torch.Tensor, latents_mean: torch.Tensor, latents_std: torch.Tensor):
627
  latents_mean = latents_mean.to(latents.device, latents.dtype)
628
  latents_std = latents_std.to(latents.device, latents.dtype)
629
+ return (latents * latents_std) + latents_mean
630
 
631
+ # Copied from diffusers.pipelines.ltx.pipeline_ltx_condition.LTXConditionPipeline.trim_conditioning_sequence
632
+ def trim_conditioning_sequence(self, start_frame: int, sequence_num_frames: int, target_num_frames: int) -> int:
633
+ scale_factor = self.vae_temporal_compression_ratio
634
+ num_frames = min(sequence_num_frames, target_num_frames - start_frame)
635
+ num_frames = (num_frames - 1) // scale_factor * scale_factor + 1
636
+ return num_frames
637
 
638
+ def preprocess_conditions(
639
+ self,
640
+ conditions: Optional[Union[LTX2VideoCondition, List[LTX2VideoCondition]]] = None,
641
+ height: int = 512,
642
+ width: int = 768,
643
+ num_frames: int = 121,
644
+ device: Optional[torch.device] = None,
645
+ index_type: str = "latent",
646
+ ) -> Tuple[List[torch.Tensor], List[float], List[int]]:
647
+ conditioning_frames, conditioning_strengths, conditioning_indices = [], [], []
648
+
649
+ if conditions is None:
650
+ conditions = []
651
+ if isinstance(conditions, LTX2VideoCondition):
652
+ conditions = [conditions]
653
+
654
+ frame_scale_factor = self.vae_temporal_compression_ratio
655
+ latent_num_frames = (num_frames - 1) // frame_scale_factor + 1
656
+ for i, condition in enumerate(conditions):
657
+ if isinstance(condition.frames, PIL.Image.Image):
658
+ video_like_cond = [condition.frames]
659
+ elif isinstance(condition.frames, np.ndarray) and condition.frames.ndim == 3:
660
+ video_like_cond = np.expand_dims(condition.frames, axis=0)
661
+ elif isinstance(condition.frames, torch.Tensor) and condition.frames.ndim == 3:
662
+ video_like_cond = condition.frames.unsqueeze(0)
663
+ else:
664
+ video_like_cond = condition.frames
665
+
666
+ condition_pixels = self.video_processor.preprocess_video(video_like_cond, height, width)
667
+
668
+ latent_start_idx = condition.index
669
+ if latent_start_idx < 0:
670
+ latent_start_idx = latent_start_idx % latent_num_frames
671
+ if latent_start_idx >= latent_num_frames:
672
+ logger.warning(
673
+ f"The starting latent index {latent_start_idx} of condition {i} is too big for the specified number"
674
+ f" of latent frames {latent_num_frames}. This condition will be skipped."
675
+ )
676
+ continue
677
+
678
+ cond_num_frames = condition_pixels.size(2)
679
+ start_idx = max((latent_start_idx - 1) * frame_scale_factor + 1, 0)
680
+ truncated_cond_frames = self.trim_conditioning_sequence(start_idx, cond_num_frames, num_frames)
681
+ condition_pixels = condition_pixels[:, :, :truncated_cond_frames]
682
+
683
+ conditioning_frames.append(condition_pixels.to(dtype=self.vae.dtype, device=device))
684
+ conditioning_strengths.append(condition.strength)
685
+ conditioning_indices.append(latent_start_idx)
686
+
687
+ return conditioning_frames, conditioning_strengths, conditioning_indices
688
+
689
+ def apply_visual_conditioning(
690
+ self,
691
+ latents: torch.Tensor,
692
+ conditioning_mask: torch.Tensor,
693
+ condition_latents: List[torch.Tensor],
694
+ condition_strengths: List[float],
695
+ condition_indices: List[int],
696
+ latent_height: int,
697
+ latent_width: int,
698
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
699
+ clean_latents = torch.zeros_like(latents)
700
+ for cond, strength, latent_idx in zip(condition_latents, condition_strengths, condition_indices):
701
+ num_cond_tokens = cond.size(1)
702
+ start_token_idx = latent_idx * latent_height * latent_width
703
+ end_token_idx = start_token_idx + num_cond_tokens
704
+
705
+ latents[:, start_token_idx:end_token_idx] = cond
706
+ conditioning_mask[:, start_token_idx:end_token_idx] = strength
707
+ clean_latents[:, start_token_idx:end_token_idx] = cond
708
+
709
+ return latents, conditioning_mask, clean_latents
710
+
711
+ def prepare_latents(
712
+ self,
713
+ conditions: Optional[Union[LTX2VideoCondition, List[LTX2VideoCondition]]] = None,
714
+ batch_size: int = 1,
715
+ num_channels_latents: int = 128,
716
+ height: int = 512,
717
+ width: int = 768,
718
+ num_frames: int = 121,
719
+ noise_scale: float = 1.0,
720
+ dtype: Optional[torch.dtype] = None,
721
+ device: Optional[torch.device] = None,
722
+ generator: Optional[torch.Generator] = None,
723
+ latents: Optional[torch.Tensor] = None,
724
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
725
+ latent_height = height // self.vae_spatial_compression_ratio
726
+ latent_width = width // self.vae_spatial_compression_ratio
727
+ latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
728
+
729
+ shape = (batch_size, num_channels_latents, latent_num_frames, latent_height, latent_width)
730
+ mask_shape = (batch_size, 1, latent_num_frames, latent_height, latent_width)
731
+
732
+ if latents is not None:
733
+ if latents.ndim == 5:
734
+ latents = self._normalize_latents(
735
+ latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
736
+ )
737
+ else:
738
+ latents = torch.zeros(shape, device=device, dtype=dtype)
739
+
740
+ conditioning_mask = latents.new_zeros(mask_shape)
741
+ if latents.ndim == 5:
742
+ latents = self._pack_latents(
743
+ latents, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
744
+ )
745
+ conditioning_mask = self._pack_latents(
746
+ conditioning_mask, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
747
+ )
748
+
749
+ if latents.ndim != 3 or latents.shape[:2] != conditioning_mask.shape[:2]:
750
+ raise ValueError(
751
+ f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is {conditioning_mask.shape[:2] + (num_channels_latents,)}."
752
+ )
753
+
754
+ if isinstance(generator, list):
755
+ logger.warning(
756
+ f"{self.__class__.__name__} does not support using a list of generators. The first generator in the"
757
+ f" list will be used for all (pseudo-)random operations."
758
+ )
759
+ generator = generator[0]
760
+
761
+ condition_frames, condition_strengths, condition_indices = self.preprocess_conditions(
762
+ conditions, height, width, num_frames, device=device
763
+ )
764
+ condition_latents = []
765
+ for condition_tensor in condition_frames:
766
+ condition_latent = retrieve_latents(
767
+ self.vae.encode(condition_tensor), generator=generator, sample_mode="argmax"
768
+ )
769
+ condition_latent = self._normalize_latents(
770
+ condition_latent, self.vae.latents_mean, self.vae.latents_std
771
+ ).to(device=device, dtype=dtype)
772
+ condition_latent = self._pack_latents(
773
+ condition_latent, self.transformer_spatial_patch_size, self.transformer_temporal_patch_size
774
+ )
775
+ condition_latents.append(condition_latent)
776
+
777
+ latents, conditioning_mask, clean_latents = self.apply_visual_conditioning(
778
+ latents,
779
+ conditioning_mask,
780
+ condition_latents,
781
+ condition_strengths,
782
+ condition_indices,
783
+ latent_height=latent_height,
784
+ latent_width=latent_width,
785
+ )
786
+
787
+ noise = randn_tensor(latents.shape, generator=generator, device=latents.device, dtype=latents.dtype)
788
+ scaled_mask = (1.0 - conditioning_mask) * noise_scale
789
+ latents = noise * scaled_mask + latents * (1 - scaled_mask)
790
+
791
+ return latents, conditioning_mask, clean_latents
792
+
793
+ # -------------------- Audio conditioning additions (minimal) --------------------
794
+
795
+ def _get_audio_duration(self, audio: Union[str, torch.Tensor], sample_rate: int) -> float:
796
+ if isinstance(audio, str):
797
+ info = torchaudio.info(audio)
798
+ return info.num_frames / info.sample_rate
799
+ num_samples = audio.shape[-1]
800
+ return num_samples / sample_rate
801
 
802
  def _preprocess_audio(self, audio: Union[str, torch.Tensor], target_sample_rate: int) -> torch.Tensor:
 
803
  if isinstance(audio, str):
804
  waveform, sr = torchaudio.load(audio)
805
  else:
 
809
  if sr != target_sample_rate:
810
  waveform = torchaudio.functional.resample(waveform, sr, target_sample_rate)
811
 
812
+ if waveform.ndim == 1:
813
+ waveform = waveform.unsqueeze(0)
814
  if waveform.shape[0] == 1:
815
  waveform = waveform.repeat(2, 1)
816
  elif waveform.shape[0] > 2:
817
  waveform = waveform[:2, :]
818
 
819
+ waveform = waveform.unsqueeze(0) # [B, 2, samples]
820
 
821
  n_fft = 1024
822
  mel_transform = T.MelSpectrogram(
 
835
  norm="slaney",
836
  )
837
 
838
+ mel_spec = mel_transform(waveform) # [B, 2, mel_bins, T]
839
  mel_spec = torch.log(torch.clamp(mel_spec, min=1e-5))
840
+ mel_spec = mel_spec.permute(0, 1, 3, 2).contiguous() # [B, 2, T, mel_bins]
 
841
  return mel_spec
842
 
843
+ # Copied from diffusers.pipelines.ltx2.pipeline_ltx2.LTX2Pipeline.prepare_audio_latents (modified minimally)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
844
    def prepare_audio_latents(
        self,
        batch_size: int = 1,
        num_channels_latents: int = 8,
        audio_latent_length: int = 1,  # 1 is just a dummy value
        num_mel_bins: int = 64,
        noise_scale: float = 0.0,
        dtype: Optional[torch.dtype] = None,
        device: Optional[torch.device] = None,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.Tensor] = None,
        audio_input: Optional[Union[str, torch.Tensor]] = None,
    ) -> Tuple[torch.Tensor, int, Optional[torch.Tensor]]:
        """
        Prepare the packed audio latents consumed by the transformer.

        Exactly one of three paths runs, in priority order:
        1. `latents` given: pack (if 4D), normalize, apply noise, return them.
        2. `audio_input` given: encode it with the audio VAE into clean
           conditioning latents; also return a freshly sampled noisy tensor.
        3. Neither given: sample pure Gaussian noise of the expected shape.

        Returns:
            - packed noisy audio latents [B, S, D]
            - audio_latent_length
            - packed clean audio latents [B, S, D] if audio_input is provided, else None
        """
        if latents is not None:
            # 4D latents are in [B, C, L, F] layout; pack to the [B, S, D]
            # sequence layout expected downstream.
            if latents.ndim == 4:
                latents = self._pack_audio_latents(latents)
            if latents.ndim != 3:
                raise ValueError(
                    f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is [batch_size, num_seq, num_features]."
                )
            latents = self._normalize_audio_latents(latents, self.audio_vae.latents_mean, self.audio_vae.latents_std)
            latents = self._create_noised_state(latents, noise_scale, generator)
            return latents.to(device=device, dtype=dtype), audio_latent_length, None

        # If audio input is provided, encode to clean latents and return both clean and a dummy noisy tensor
        if audio_input is not None:
            mel_spec = self._preprocess_audio(audio_input, self.audio_sampling_rate).to(device=device)
            mel_spec = mel_spec.to(dtype=self.audio_vae.dtype)

            clean_4d = self.audio_vae.encode(mel_spec).latent_dist.sample(generator)  # [B, C, L, F]

            # pad/trim to audio_latent_length
            cur_len = clean_4d.shape[2]
            if cur_len < audio_latent_length:
                pad = audio_latent_length - cur_len
                # Zero-pad along the temporal (L) axis only.
                clean_4d = torch.nn.functional.pad(clean_4d, (0, 0, 0, pad))
            elif cur_len > audio_latent_length:
                clean_4d = clean_4d[:, :, :audio_latent_length, :]

            # Broadcast the encoded clip across the requested batch.
            # NOTE(review): assumes the encoder batch dim is 1 whenever it
            # differs from `batch_size`; other mismatches would silently
            # repeat — confirm against callers.
            if clean_4d.shape[0] != batch_size:
                clean_4d = clean_4d.repeat(batch_size, 1, 1, 1)

            clean_packed = self._pack_audio_latents(clean_4d)  # [B, S, D]
            clean_packed = clean_packed.to(dtype=dtype)
            clean_packed = self._normalize_audio_latents(
                clean_packed, self.audio_vae.latents_mean, self.audio_vae.latents_std
            )

            # The noisy companion tensor starts from pure Gaussian noise with
            # the same packed shape as the clean latents.
            noisy = randn_tensor(clean_packed.shape, generator=generator, device=device, dtype=dtype)
            noisy = self._create_noised_state(noisy, noise_scale, generator=None)  # keep same scaling semantics

            return noisy, audio_latent_length, clean_packed

        # Fallback: no latents and no audio — sample pure noise at the latent
        # mel resolution.
        latent_mel_bins = num_mel_bins // self.audio_vae_mel_compression_ratio
        shape = (batch_size, num_channels_latents, audio_latent_length, latent_mel_bins)

        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        latents = self._pack_audio_latents(latents)
        return latents, audio_latent_length, None
915
 
916
+ # ------------------------------------------------------------------
917
 
 
 
918
    @property
    def guidance_scale(self):
        """Return the stored classifier-free guidance scale (`self._guidance_scale`)."""
        return self._guidance_scale
 
943
    def interrupt(self):
        """Return the interruption flag checked at each step of the denoising loop."""
        return self._interrupt
945
 
 
 
 
 
 
 
 
 
 
 
946
  @torch.no_grad()
947
  @replace_example_docstring(EXAMPLE_DOC_STRING)
948
  def __call__(
949
  self,
950
+ conditions: Union[LTX2VideoCondition, List[LTX2VideoCondition]] = None,
 
 
 
951
  audio: Optional[Union[str, torch.Tensor]] = None,
952
  prompt: Union[str, List[str]] = None,
953
  negative_prompt: Optional[Union[str, List[str]]] = None,
954
  height: int = 512,
955
  width: int = 768,
956
+ num_frames: Optional[int] = 121,
957
  max_frames: int = 257,
958
  frame_rate: float = 24.0,
959
  num_inference_steps: int = 40,
 
960
  sigmas: Optional[List[float]] = None,
961
+ timesteps: List[int] = None,
962
  guidance_scale: float = 4.0,
963
  guidance_rescale: float = 0.0,
964
+ noise_scale: Optional[float] = None,
965
  num_videos_per_prompt: Optional[int] = 1,
966
  generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
967
  latents: Optional[torch.Tensor] = None,
 
979
  callback_on_step_end_tensor_inputs: List[str] = ["latents"],
980
  max_sequence_length: int = 1024,
981
  ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
982
  if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
983
  callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
984
 
985
+ # Optional: derive num_frames from audio if user passes num_frames=None
986
+ if num_frames is None and audio is not None:
987
+ audio_duration = self._get_audio_duration(audio, self.audio_sampling_rate)
988
+ calculated_frames = int(audio_duration * frame_rate) + 1
989
+ num_frames = min(calculated_frames, max_frames)
990
+ num_frames = (
991
+ (num_frames - 1) // self.vae_temporal_compression_ratio
992
+ ) * self.vae_temporal_compression_ratio + 1
993
+ num_frames = max(num_frames, 9)
994
+
995
+ # 1. Check inputs. Raise error if not correct
 
996
  self.check_inputs(
997
  prompt=prompt,
998
  height=height,
 
1010
  self._interrupt = False
1011
  self._current_timestep = None
1012
 
1013
+ # 2. Define call parameters
1014
  if prompt is not None and isinstance(prompt, str):
1015
  batch_size = 1
1016
  elif prompt is not None and isinstance(prompt, list):
 
1018
  else:
1019
  batch_size = prompt_embeds.shape[0]
1020
 
1021
+ if conditions is not None and not isinstance(conditions, list):
1022
+ conditions = [conditions]
1023
+
1024
+ # Infer noise scale: first (largest) sigma value if using custom sigmas, else 1.0
1025
+ if noise_scale is None:
1026
+ noise_scale = sigmas[0] if sigmas is not None else 1.0
1027
+
1028
  device = self._execution_device
1029
 
1030
+ # 3. Prepare text embeddings
1031
  (
1032
  prompt_embeds,
1033
  prompt_attention_mask,
 
1054
  prompt_embeds, additive_attention_mask, additive_mask=True
1055
  )
1056
 
1057
+ # 4. Prepare latent variables
1058
+ latent_num_frames = (num_frames - 1) // self.vae_temporal_compression_ratio + 1
1059
+ latent_height = height // self.vae_spatial_compression_ratio
1060
+ latent_width = width // self.vae_spatial_compression_ratio
1061
+ if latents is not None:
1062
+ if latents.ndim == 5:
1063
+ logger.info(
1064
+ "Got latents of shape [batch_size, latent_dim, latent_frames, latent_height, latent_width], `latent_num_frames`, `latent_height`, `latent_width` will be inferred."
1065
+ )
1066
+ _, _, latent_num_frames, latent_height, latent_width = latents.shape
1067
+ elif latents.ndim == 3:
1068
+ logger.warning(
1069
+ f"You have supplied packed `latents` of shape {latents.shape}, so the latent dims cannot be"
1070
+ f" inferred. Make sure the supplied `height`, `width`, and `num_frames` are correct."
1071
+ )
1072
+ else:
1073
+ raise ValueError(
1074
+ f"Provided `latents` tensor has shape {latents.shape}, but the expected shape is either [batch_size, seq_len, num_features] or [batch_size, latent_dim, latent_frames, latent_height, latent_width]."
1075
+ )
1076
+ video_sequence_length = latent_num_frames * latent_height * latent_width
1077
 
 
1078
  num_channels_latents = self.transformer.config.in_channels
1079
+ latents, conditioning_mask, clean_latents = self.prepare_latents(
1080
+ conditions,
1081
+ batch_size * num_videos_per_prompt,
1082
+ num_channels_latents,
1083
+ height,
1084
+ width,
1085
+ num_frames,
1086
+ noise_scale,
1087
+ torch.float32,
1088
+ device,
1089
+ generator,
1090
+ latents,
 
 
1091
  )
1092
  if self.do_classifier_free_guidance:
1093
  conditioning_mask = torch.cat([conditioning_mask, conditioning_mask])
1094
 
1095
+ duration_s = num_frames / frame_rate
1096
+ audio_latents_per_second = (
1097
+ self.audio_sampling_rate / self.audio_hop_length / float(self.audio_vae_temporal_compression_ratio)
1098
+ )
1099
+ audio_num_frames = round(duration_s * audio_latents_per_second)
1100
+ if audio_latents is not None:
1101
+ if audio_latents.ndim == 4:
1102
+ logger.info(
1103
+ "Got audio_latents of shape [batch_size, num_channels, audio_length, mel_bins], `audio_num_frames` will be inferred."
1104
+ )
1105
+ _, _, audio_num_frames, _ = audio_latents.shape
1106
+ elif audio_latents.ndim == 3:
1107
+ logger.warning(
1108
+ f"You have supplied packed `audio_latents` of shape {audio_latents.shape}, so the latent dims"
1109
+ f" cannot be inferred. Make sure the supplied `num_frames` and `frame_rate` are correct."
1110
+ )
1111
+ else:
1112
+ raise ValueError(
1113
+ f"Provided `audio_latents` tensor has shape {audio_latents.shape}, but the expected shape is either [batch_size, seq_len, num_features] or [batch_size, num_channels, audio_length, mel_bins]."
1114
+ )
1115
+
1116
  num_mel_bins = self.audio_vae.config.mel_bins if getattr(self, "audio_vae", None) is not None else 64
1117
  latent_mel_bins = num_mel_bins // self.audio_vae_mel_compression_ratio
 
1118
  num_channels_latents_audio = (
1119
  self.audio_vae.config.latent_channels if getattr(self, "audio_vae", None) is not None else 8
1120
  )
 
1122
  audio_latents, audio_num_frames, clean_audio_latents = self.prepare_audio_latents(
1123
  batch_size * num_videos_per_prompt,
1124
  num_channels_latents=num_channels_latents_audio,
1125
+ audio_latent_length=audio_num_frames,
1126
  num_mel_bins=num_mel_bins,
1127
+ noise_scale=noise_scale,
 
 
 
1128
  dtype=torch.float32,
1129
  device=device,
1130
  generator=generator,
1131
  latents=audio_latents,
1132
  audio_input=audio,
1133
  )
1134
+ # clean_audio_latents is packed [B,S,D] if present
1135
+ packed_clean_audio_latents = clean_audio_latents
1136
 
1137
+ # 5. Prepare timesteps
1138
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
 
 
 
 
 
 
 
 
 
 
1139
  mu = calculate_shift(
1140
  video_sequence_length,
1141
  self.scheduler.config.get("base_image_seq_len", 1024),
 
1176
  audio_latents.shape[0], audio_num_frames, audio_latents.device
1177
  )
1178
 
1179
+ # 7. Denoising loop
1180
  with self.progress_bar(total=num_inference_steps) as progress_bar:
1181
  for i, t in enumerate(timesteps):
1182
  if self.interrupt:
 
1184
 
1185
  self._current_timestep = t
1186
 
1187
+ # If audio conditioning provided, use clean audio latents directly (packed), and timestep=0
1188
  if packed_clean_audio_latents is not None:
1189
  audio_latents_input = packed_clean_audio_latents.to(dtype=prompt_embeds.dtype)
1190
  else:
 
1198
  audio_latent_model_input = audio_latent_model_input.to(prompt_embeds.dtype)
1199
 
1200
  timestep = t.expand(latent_model_input.shape[0])
1201
+ video_timestep = timestep.unsqueeze(-1) * (1 - conditioning_mask.squeeze(-1))
1202
 
1203
  if packed_clean_audio_latents is not None:
1204
  audio_timestep = torch.zeros_like(timestep)
 
1222
  audio_num_frames=audio_num_frames,
1223
  video_coords=video_coords,
1224
  audio_coords=audio_coords,
1225
+ # rope_interpolation_scale=rope_interpolation_scale,
1226
  attention_kwargs=attention_kwargs,
1227
  return_dict=False,
1228
  )
 
1248
  noise_pred_audio, noise_pred_audio_text, guidance_rescale=self.guidance_rescale
1249
  )
1250
 
1251
+ bsz = noise_pred_video.size(0)
1252
+ sigma = self.scheduler.sigmas[i]
1253
+ denoised_sample = latents - noise_pred_video * sigma
1254
+ denoised_sample_cond = (
1255
+ denoised_sample * (1 - conditioning_mask[:bsz]) + clean_latents.float() * conditioning_mask[:bsz]
1256
+ ).to(noise_pred_video.dtype)
1257
+ denoised_latents_cond = ((latents - denoised_sample_cond) / sigma).to(noise_pred_video.dtype)
 
 
 
 
 
 
 
 
 
 
 
 
 
1258
 
1259
+ latents = self.scheduler.step(denoised_latents_cond, t, latents, return_dict=False)[0]
 
 
 
1260
 
1261
+ # Only step audio latents if not conditioning on clean audio
1262
  if packed_clean_audio_latents is None:
1263
  audio_latents = audio_scheduler.step(noise_pred_audio, t, audio_latents, return_dict=False)[0]
1264
 
 
1267
  for k in callback_on_step_end_tensor_inputs:
1268
  callback_kwargs[k] = locals()[k]
1269
  callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1270
+
1271
  latents = callback_outputs.pop("latents", latents)
1272
  prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1273
 
 
1277
  if XLA_AVAILABLE:
1278
  xm.mark_step()
1279
 
 
1280
  latents = self._unpack_latents(
1281
  latents,
1282
  latent_num_frames,
 
1289
  latents, self.vae.latents_mean, self.vae.latents_std, self.vae.config.scaling_factor
1290
  )
1291
 
1292
+ # Choose audio latents for decode: clean if provided, else denoised
1293
+ if packed_clean_audio_latents is not None:
1294
+ audio_latents_to_decode = packed_clean_audio_latents
 
 
 
 
 
1295
  else:
1296
+ audio_latents_to_decode = audio_latents
1297
+
1298
+ audio_latents_to_decode = self._denormalize_audio_latents(
1299
+ audio_latents_to_decode, self.audio_vae.latents_mean, self.audio_vae.latents_std
1300
+ )
1301
+ audio_latents_to_decode = self._unpack_audio_latents(
1302
+ audio_latents_to_decode, audio_num_frames, num_mel_bins=latent_mel_bins
1303
+ )
1304
 
1305
  if output_type == "latent":
1306
  video = latents
1307
+ audio_out = audio_latents_to_decode
1308
  else:
1309
  latents = latents.to(prompt_embeds.dtype)
1310
 
 
1329
  video = self.vae.decode(latents, timestep, return_dict=False)[0]
1330
  video = self.video_processor.postprocess_video(video, output_type=output_type)
1331
 
1332
+ audio_latents_to_decode = audio_latents_to_decode.to(self.audio_vae.dtype)
1333
+ generated_mel_spectrograms = self.audio_vae.decode(audio_latents_to_decode, return_dict=False)[0]
1334
+ audio_out = self.vocoder(generated_mel_spectrograms)
1335
 
1336
  self.maybe_free_model_hooks()
1337
 
1338
  if not return_dict:
1339
+ return (video, audio_out)
1340
 
1341
+ return LTX2PipelineOutput(frames=video, audio=audio_out)