LTX-2-3-First-Last-Frame

Running on Zero

App Files Community

linoyts HF Staff committed on Mar 9

Commit

b6775a7

verified ·

1 Parent(s): 85c5116

add audio input support

Browse files

Files changed (1) hide show

app.py +202 -26

app.py CHANGED Viewed

@@ -42,11 +42,25 @@ import gradio as gr
 import numpy as np
 from huggingface_hub import hf_hub_download, snapshot_download
-from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
 from ltx_core.quantization import QuantizationPolicy
 from ltx_pipelines.distilled import DistilledPipeline
 from ltx_pipelines.utils.args import ImageConditioningInput
-from ltx_pipelines.utils.media_io import encode_video
 # Force-patch xformers attention into the LTX attention module.
 from ltx_core.model.transformer import attention as _attn_mod
@@ -75,6 +89,169 @@ RESOLUTIONS = {
     "low": {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768)},
 }
 # Model repos
 LTX_MODEL_REPO = "diffusers-internal-dev/ltx-23"
 GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
@@ -92,8 +269,8 @@ print(f"Checkpoint: {checkpoint_path}")
 print(f"Spatial upsampler: {spatial_upsampler_path}")
 print(f"Gemma root: {gemma_root}")
-# Initialize pipeline WITH text encoder
-pipeline = DistilledPipeline(
     distilled_checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=gemma_root,
@@ -102,11 +279,12 @@ pipeline = DistilledPipeline(
 )
 # Preload all models for ZeroGPU tensor packing.
-print("Preloading all models (including Gemma)...")
 ledger = pipeline.model_ledger
 _transformer = ledger.transformer()
 _video_encoder = ledger.video_encoder()
 _video_decoder = ledger.video_decoder()
 _audio_decoder = ledger.audio_decoder()
 _vocoder = ledger.vocoder()
 _spatial_upsampler = ledger.spatial_upsampler()
@@ -116,12 +294,13 @@ _embeddings_processor = ledger.gemma_embeddings_processor()
 ledger.transformer = lambda: _transformer
 ledger.video_encoder = lambda: _video_encoder
 ledger.video_decoder = lambda: _video_decoder
 ledger.audio_decoder = lambda: _audio_decoder
 ledger.vocoder = lambda: _vocoder
 ledger.spatial_upsampler = lambda: _spatial_upsampler
 ledger.text_encoder = lambda: _text_encoder
 ledger.gemma_embeddings_processor = lambda: _embeddings_processor
-print("All models preloaded (including Gemma text encoder)!")
 print("=" * 80)
 print("Pipeline ready!")
@@ -137,7 +316,6 @@ def log_memory(tag: str):
 def detect_aspect_ratio(image) -> str:
-    """Detect the closest aspect ratio (16:9, 9:16, or 1:1) from an image."""
     if image is None:
         return "16:9"
     if hasattr(image, "size"):
@@ -152,8 +330,6 @@ def detect_aspect_ratio(image) -> str:
 def on_image_upload(first_image, last_image, high_res):
-    """Auto-set resolution when an image is uploaded."""
-    # Use first image for aspect ratio detection, fall back to last image
     ref_image = first_image if first_image is not None else last_image
     aspect = detect_aspect_ratio(ref_image)
     tier = "high" if high_res else "low"
@@ -162,7 +338,6 @@ def on_image_upload(first_image, last_image, high_res):
 def on_highres_toggle(first_image, last_image, high_res):
-    """Update resolution when high-res toggle changes."""
     ref_image = first_image if first_image is not None else last_image
     aspect = detect_aspect_ratio(ref_image)
     tier = "high" if high_res else "low"
@@ -175,6 +350,7 @@ def on_highres_toggle(first_image, last_image, high_res):
 def generate_video(
     first_image,
     last_image,
     prompt: str,
     duration: float,
     enhance_prompt: bool = True,
@@ -229,6 +405,7 @@ def generate_video(
             num_frames=num_frames,
             frame_rate=frame_rate,
             images=images,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
         )
@@ -257,7 +434,7 @@ def generate_video(
 with gr.Blocks(title="LTX-2.3 Distilled") as demo:
     gr.Markdown("# LTX-2.3 F2LF: Fast Audio-Video Generation with Frame Conditioning")
     gr.Markdown(
-        "Fast and high quality video + audio generation with first and last frame conditioing "
         "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
         "[[code]](https://github.com/Lightricks/LTX-2)"
     )
@@ -267,6 +444,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
             with gr.Row():
                 first_image = gr.Image(label="First Frame (Optional)", type="pil")
                 last_image = gr.Image(label="Last Frame (Optional)", type="pil")
             prompt = gr.Textbox(
                 label="Prompt",
                 info="for best results - make it as elaborate as possible",
@@ -274,7 +452,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
                 lines=3,
                 placeholder="Describe the motion and animation you want...",
             )
             with gr.Row():
                 duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
                 with gr.Column():
@@ -292,34 +470,33 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
         with gr.Column():
             output_video = gr.Video(label="Generated Video", autoplay=True)
     gr.Examples(
         examples=[
             [
-                None,  # first_image
-                "pinkknit.jpg",           # last_image
                 "The camera falls downward through darkness as if dropped into a tunnel. "
                 "As it slows, five friends wearing pink knitted hats and sunglasses lean "
                 "over and look down toward the camera with curious expressions. The lens "
                 "has a strong fisheye effect, creating a circular frame around them. They "
                 "crowd together closely, forming a symmetrical cluster while staring "
                 "directly into the lens.",
-                3.0,   # duration
-                False, # enhance_prompt
-                42,    # seed
-                True, # randomize_seed
                 1024,
-                1024
             ],
         ],
         inputs=[
-            first_image, last_image, prompt, duration,
             enhance_prompt, seed, randomize_seed, height, width,
         ],
     )
-    # Auto-detect aspect ratio from uploaded image and set resolution
     first_image.change(
         fn=on_image_upload,
         inputs=[first_image, last_image, high_res],
@@ -332,7 +509,6 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
         outputs=[width, height],
     )
-    # Update resolution when high-res toggle changes
     high_res.change(
         fn=on_highres_toggle,
         inputs=[first_image, last_image, high_res],
@@ -342,7 +518,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
     generate_btn.click(
         fn=generate_video,
         inputs=[
-            first_image, last_image, prompt, duration, enhance_prompt,
             seed, randomize_seed, height, width,
         ],
         outputs=[output_video, seed],
@@ -354,4 +530,4 @@ css = """
 """
 if __name__ == "__main__":
-    demo.launch(theme=gr.themes.Citrus(), css=css)

 import numpy as np
 from huggingface_hub import hf_hub_download, snapshot_download
+from ltx_core.components.diffusion_steps import EulerDiffusionStep
+from ltx_core.components.noisers import GaussianNoiser
+from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
+from ltx_core.model.upsampler import upsample_video
+from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number, decode_video as vae_decode_video
 from ltx_core.quantization import QuantizationPolicy
+from ltx_core.types import Audio, AudioLatentShape, VideoPixelShape
 from ltx_pipelines.distilled import DistilledPipeline
+from ltx_pipelines.utils import euler_denoising_loop
 from ltx_pipelines.utils.args import ImageConditioningInput
+from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
+from ltx_pipelines.utils.helpers import (
+    cleanup_memory,
+    combined_image_conditionings,
+    denoise_video_only,
+    encode_prompts,
+    simple_denoising_func,
+)
+from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
 # Force-patch xformers attention into the LTX attention module.
 from ltx_core.model.transformer import attention as _attn_mod
     "low": {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768)},
 }
+class LTX23DistilledA2VPipeline(DistilledPipeline):
+    """DistilledPipeline with optional audio conditioning."""
+    def __call__(
+        self,
+        prompt: str,
+        seed: int,
+        height: int,
+        width: int,
+        num_frames: int,
+        frame_rate: float,
+        images: list[ImageConditioningInput],
+        audio_path: str | None = None,
+        tiling_config: TilingConfig | None = None,
+        enhance_prompt: bool = False,
+    ):
+        # Standard path when no audio input is provided.
+        if audio_path is None:
+            return super().__call__(
+                prompt=prompt,
+                seed=seed,
+                height=height,
+                width=width,
+                num_frames=num_frames,
+                frame_rate=frame_rate,
+                images=images,
+                tiling_config=tiling_config,
+                enhance_prompt=enhance_prompt,
+            )
+        generator = torch.Generator(device=self.device).manual_seed(seed)
+        noiser = GaussianNoiser(generator=generator)
+        stepper = EulerDiffusionStep()
+        dtype = torch.bfloat16
+        (ctx_p,) = encode_prompts(
+            [prompt],
+            self.model_ledger,
+            enhance_first_prompt=enhance_prompt,
+            enhance_prompt_image=images[0].path if len(images) > 0 else None,
+        )
+        video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
+        video_duration = num_frames / frame_rate
+        decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
+        if decoded_audio is None:
+            raise ValueError(f"Could not extract audio stream from {audio_path}")
+        encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
+        audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
+        expected_frames = audio_shape.frames
+        actual_frames = encoded_audio_latent.shape[2]
+        if actual_frames > expected_frames:
+            encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
+        elif actual_frames < expected_frames:
+            pad = torch.zeros(
+                encoded_audio_latent.shape[0],
+                encoded_audio_latent.shape[1],
+                expected_frames - actual_frames,
+                encoded_audio_latent.shape[3],
+                device=encoded_audio_latent.device,
+                dtype=encoded_audio_latent.dtype,
+            )
+            encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
+        video_encoder = self.model_ledger.video_encoder()
+        transformer = self.model_ledger.transformer()
+        stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
+        def denoising_loop(sigmas, video_state, audio_state, stepper):
+            return euler_denoising_loop(
+                sigmas=sigmas,
+                video_state=video_state,
+                audio_state=audio_state,
+                stepper=stepper,
+                denoise_fn=simple_denoising_func(
+                    video_context=video_context,
+                    audio_context=audio_context,
+                    transformer=transformer,
+                ),
+            )
+        stage_1_output_shape = VideoPixelShape(
+            batch=1,
+            frames=num_frames,
+            width=width // 2,
+            height=height // 2,
+            fps=frame_rate,
+        )
+        stage_1_conditionings = combined_image_conditionings(
+            images=images,
+            height=stage_1_output_shape.height,
+            width=stage_1_output_shape.width,
+            video_encoder=video_encoder,
+            dtype=dtype,
+            device=self.device,
+        )
+        video_state = denoise_video_only(
+            output_shape=stage_1_output_shape,
+            conditionings=stage_1_conditionings,
+            noiser=noiser,
+            sigmas=stage_1_sigmas,
+            stepper=stepper,
+            denoising_loop_fn=denoising_loop,
+            components=self.pipeline_components,
+            dtype=dtype,
+            device=self.device,
+            initial_audio_latent=encoded_audio_latent,
+        )
+        torch.cuda.synchronize()
+        cleanup_memory()
+        upscaled_video_latent = upsample_video(
+            latent=video_state.latent[:1],
+            video_encoder=video_encoder,
+            upsampler=self.model_ledger.spatial_upsampler(),
+        )
+        stage_2_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=self.device)
+        stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
+        stage_2_conditionings = combined_image_conditionings(
+            images=images,
+            height=stage_2_output_shape.height,
+            width=stage_2_output_shape.width,
+            video_encoder=video_encoder,
+            dtype=dtype,
+            device=self.device,
+        )
+        video_state = denoise_video_only(
+            output_shape=stage_2_output_shape,
+            conditionings=stage_2_conditionings,
+            noiser=noiser,
+            sigmas=stage_2_sigmas,
+            stepper=stepper,
+            denoising_loop_fn=denoising_loop,
+            components=self.pipeline_components,
+            dtype=dtype,
+            device=self.device,
+            noise_scale=stage_2_sigmas[0],
+            initial_video_latent=upscaled_video_latent,
+            initial_audio_latent=encoded_audio_latent,
+        )
+        torch.cuda.synchronize()
+        del transformer
+        del video_encoder
+        cleanup_memory()
+        decoded_video = vae_decode_video(
+            video_state.latent,
+            self.model_ledger.video_decoder(),
+            tiling_config,
+            generator,
+        )
+        original_audio = Audio(
+            waveform=decoded_audio.waveform.squeeze(0),
+            sampling_rate=decoded_audio.sampling_rate,
+        )
+        return decoded_video, original_audio
 # Model repos
 LTX_MODEL_REPO = "diffusers-internal-dev/ltx-23"
 GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
 print(f"Spatial upsampler: {spatial_upsampler_path}")
 print(f"Gemma root: {gemma_root}")
+# Initialize pipeline WITH text encoder and optional audio support
+pipeline = LTX23DistilledA2VPipeline(
     distilled_checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=gemma_root,
 )
 # Preload all models for ZeroGPU tensor packing.
+print("Preloading all models (including Gemma and audio components)...")
 ledger = pipeline.model_ledger
 _transformer = ledger.transformer()
 _video_encoder = ledger.video_encoder()
 _video_decoder = ledger.video_decoder()
+_audio_encoder = ledger.audio_encoder()
 _audio_decoder = ledger.audio_decoder()
 _vocoder = ledger.vocoder()
 _spatial_upsampler = ledger.spatial_upsampler()
 ledger.transformer = lambda: _transformer
 ledger.video_encoder = lambda: _video_encoder
 ledger.video_decoder = lambda: _video_decoder
+ledger.audio_encoder = lambda: _audio_encoder
 ledger.audio_decoder = lambda: _audio_decoder
 ledger.vocoder = lambda: _vocoder
 ledger.spatial_upsampler = lambda: _spatial_upsampler
 ledger.text_encoder = lambda: _text_encoder
 ledger.gemma_embeddings_processor = lambda: _embeddings_processor
+print("All models preloaded (including Gemma text encoder and audio encoder)!")
 print("=" * 80)
 print("Pipeline ready!")
 def detect_aspect_ratio(image) -> str:
     if image is None:
         return "16:9"
     if hasattr(image, "size"):
 def on_image_upload(first_image, last_image, high_res):
     ref_image = first_image if first_image is not None else last_image
     aspect = detect_aspect_ratio(ref_image)
     tier = "high" if high_res else "low"
 def on_highres_toggle(first_image, last_image, high_res):
     ref_image = first_image if first_image is not None else last_image
     aspect = detect_aspect_ratio(ref_image)
     tier = "high" if high_res else "low"
 def generate_video(
     first_image,
     last_image,
+    input_audio,
     prompt: str,
     duration: float,
     enhance_prompt: bool = True,
             num_frames=num_frames,
             frame_rate=frame_rate,
             images=images,
+            audio_path=input_audio,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
         )
 with gr.Blocks(title="LTX-2.3 Distilled") as demo:
     gr.Markdown("# LTX-2.3 F2LF: Fast Audio-Video Generation with Frame Conditioning")
     gr.Markdown(
+        "Fast and high quality video + audio generation with first and last frame conditioning and optional audio input "
         "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
         "[[code]](https://github.com/Lightricks/LTX-2)"
     )
             with gr.Row():
                 first_image = gr.Image(label="First Frame (Optional)", type="pil")
                 last_image = gr.Image(label="Last Frame (Optional)", type="pil")
+            input_audio = gr.Audio(label="Audio Input (Optional)", type="filepath")
             prompt = gr.Textbox(
                 label="Prompt",
                 info="for best results - make it as elaborate as possible",
                 lines=3,
                 placeholder="Describe the motion and animation you want...",
             )
             with gr.Row():
                 duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
                 with gr.Column():
         with gr.Column():
             output_video = gr.Video(label="Generated Video", autoplay=True)
     gr.Examples(
         examples=[
             [
+                None,
+                "pinkknit.jpg",
+                None,
                 "The camera falls downward through darkness as if dropped into a tunnel. "
                 "As it slows, five friends wearing pink knitted hats and sunglasses lean "
                 "over and look down toward the camera with curious expressions. The lens "
                 "has a strong fisheye effect, creating a circular frame around them. They "
                 "crowd together closely, forming a symmetrical cluster while staring "
                 "directly into the lens.",
+                3.0,
+                False,
+                42,
+                True,
+                1024,
                 1024,
             ],
         ],
         inputs=[
+            first_image, last_image, input_audio, prompt, duration,
             enhance_prompt, seed, randomize_seed, height, width,
         ],
     )
     first_image.change(
         fn=on_image_upload,
         inputs=[first_image, last_image, high_res],
         outputs=[width, height],
     )
     high_res.change(
         fn=on_highres_toggle,
         inputs=[first_image, last_image, high_res],
     generate_btn.click(
         fn=generate_video,
         inputs=[
+            first_image, last_image, input_audio, prompt, duration, enhance_prompt,
             seed, randomize_seed, height, width,
         ],
         outputs=[output_video, seed],
 """
 if __name__ == "__main__":
+    demo.launch(theme=gr.themes.Citrus(), css=css)