alex committed on
Commit
17fc24b
·
1 Parent(s): b7c34d7

guide frame

Browse files
app.py CHANGED
@@ -1404,13 +1404,14 @@ class AudioDropUpload(gr.HTML):
1404
 
1405
 
1406
 
1407
- def generate_video_example(input_image, prompt, camera_lora, resolution, radioanimated_mode, input_video, input_audio, progress=gr.Progress(track_tqdm=True)):
1408
 
1409
  w, h = apply_resolution(resolution)
1410
 
1411
  with timer(f'generating with video path:{input_video} with duration:{duration} and LoRA:{camera_lora} in {w}x{h}'):
1412
  output_video = generate_video(
1413
- input_image,
 
1414
  prompt,
1415
  10,
1416
  input_video,
@@ -1427,7 +1428,8 @@ def generate_video_example(input_image, prompt, camera_lora, resolution, radioan
1427
  return output_video
1428
 
1429
  def get_duration(
1430
- input_image,
 
1431
  prompt,
1432
  duration,
1433
  input_video,
@@ -1457,11 +1459,11 @@ def get_duration(
1457
  return 120 + extra_time
1458
  else:
1459
  return 180 + extra_time
1460
-
1461
-
1462
  @spaces.GPU(duration=get_duration)
1463
  def generate_video(
1464
- input_image,
 
1465
  prompt: str,
1466
  duration: float,
1467
  input_video = None,
@@ -1478,7 +1480,8 @@ def generate_video(
1478
  """
1479
  Generate a short cinematic video from a text prompt and optional input image using the LTX-2 distilled pipeline.
1480
  Args:
1481
- input_image: Optional input image for image-to-video. If provided, it is injected at frame 0 to guide motion.
 
1482
  prompt: Text description of the scene, motion, and cinematic style to generate.
1483
  duration: Desired video length in seconds. Converted to frames using a fixed 24 FPS rate.
1484
  input_video: Optional conditioning video path (mp4). If provided, motion is guided by this video.
@@ -1534,7 +1537,7 @@ def generate_video(
1534
  target_fps=frame_rate,
1535
  )
1536
 
1537
- if input_image is None:
1538
  images = [(first_png, 0, 1.0)]
1539
 
1540
  if audio_path is None:
@@ -1554,13 +1557,19 @@ def generate_video(
1554
  videos = [(cond_path, 1.0)]
1555
  camera_lora = "Pose"
1556
 
1557
- if input_image is not None:
1558
- images = [(input_image, 0, 1.0)]
 
 
 
 
 
 
1559
 
1560
  embeddings, final_prompt, status = encode_prompt(
1561
  prompt=prompt,
1562
  enhance_prompt=enhance_prompt,
1563
- input_image=input_image,
1564
  seed=current_seed,
1565
  negative_prompt="",
1566
  )
@@ -1653,9 +1662,10 @@ def apply_duration(duration: str):
1653
  return duration_s
1654
 
1655
  def on_mode_change(selected: str):
1656
- is_i2v = (selected == "Image-to-Video")
 
1657
 
1658
- return gr.update(visible=not is_i2v)
1659
 
1660
 
1661
 
@@ -2468,7 +2478,7 @@ def apply_example(idx: str):
2468
  idx = int(idx)
2469
 
2470
  # Read the example row from your list
2471
- img, prompt_txt, cam, res, mode, vid, aud = examples_list[idx]
2472
 
2473
  img_path = img if img else None
2474
  vid_path = vid if vid else None
@@ -2481,6 +2491,7 @@ def apply_example(idx: str):
2481
  mode_update = mode
2482
  video_update = gr.update(value=vid_path, visible=(mode == "Motion Control"))
2483
  audio_update = aud_path
 
2484
 
2485
  return (
2486
  input_image_update,
@@ -2491,6 +2502,7 @@ def apply_example(idx: str):
2491
  video_update,
2492
  audio_update,
2493
  audio_update,
 
2494
  )
2495
 
2496
 
@@ -2522,7 +2534,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2522
  with gr.Column(elem_id="col-container"):
2523
  with gr.Row(elem_id="mode-row"):
2524
  radioanimated_mode = RadioAnimated(
2525
- choices=["Image-to-Video", "Motion Control"],
2526
  value="Image-to-Video",
2527
  elem_id="radioanimated_mode"
2528
  )
@@ -2531,12 +2543,19 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2531
 
2532
  with gr.Row():
2533
 
2534
- input_image = gr.Image(
2535
  label="First Frame (Optional)",
2536
  type="filepath",
2537
  height=256
2538
  )
2539
-
 
 
 
 
 
 
 
2540
  input_video = gr.Video(
2541
  label="Motion Reference Video",
2542
  height=256,
@@ -2700,7 +2719,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2700
  radioanimated_mode.change(
2701
  fn=on_mode_change,
2702
  inputs=radioanimated_mode,
2703
- outputs=[input_video],
2704
  api_visibility="private",
2705
  )
2706
 
@@ -2728,7 +2747,8 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2728
  generate_btn.click(
2729
  fn=generate_video,
2730
  inputs=[
2731
- input_image,
 
2732
  prompt,
2733
  duration,
2734
  input_video,
@@ -2752,7 +2772,18 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2752
  "16:9",
2753
  "Image-to-Video",
2754
  None,
2755
- "examples/supergirl.m4a"
 
 
 
 
 
 
 
 
 
 
 
2756
  ],
2757
  [
2758
  "examples/supergirl.png",
@@ -2762,6 +2793,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2762
  "Image-to-Video",
2763
  None,
2764
  None,
 
2765
  ],
2766
  [
2767
  "examples/clay.png",
@@ -2771,6 +2803,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2771
  "Motion Control",
2772
  "examples/tiktok.mp4",
2773
  None,
 
2774
  ],
2775
  [
2776
  "examples/paint.png",
@@ -2780,6 +2813,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2780
  "Motion Control",
2781
  "examples/tiktok.mp4",
2782
  None,
 
2783
  ],
2784
  [
2785
  "examples/highland.png",
@@ -2789,6 +2823,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2789
  "Image-to-Video",
2790
  None,
2791
  None,
 
2792
  ],
2793
  [
2794
  "examples/wednesday.png",
@@ -2798,6 +2833,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2798
  "Image-to-Video",
2799
  None,
2800
  None,
 
2801
  ],
2802
  [
2803
  "examples/astronaut.png",
@@ -2807,13 +2843,14 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2807
  "Image-to-Video",
2808
  None,
2809
  None,
 
2810
  ],
2811
  ]
2812
 
2813
  examples_obj = create_examples(
2814
  examples=examples_list,
2815
  fn=generate_video_example,
2816
- inputs=[input_image, prompt_ui, camera_ui, resolution_ui, radioanimated_mode, input_video, audio_input],
2817
  outputs = [output_video],
2818
  label="Examples",
2819
  cache_examples=True,
@@ -2822,13 +2859,15 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2822
 
2823
  preset_gallery = PresetGallery(
2824
  items=[
2825
- {"thumb": "examples/supergirl-2.png", "label": "Example 1", "title": "Image+Audio to Video" },
2826
- {"thumb": "examples/supergirl.png" , "label": "Example 2", "title": "Image to Video" },
2827
- {"thumb": "examples/clay.png" , "label": "Example 3", "title": "Pose to Video" },
2828
- {"thumb": "examples/paint.png" , "label": "Example 4", "title": "Pose to Video" },
2829
- {"thumb": "examples/highland.png" , "label": "Example 5", "title": "Image to Video" },
2830
- {"thumb": "examples/wednesday.png" , "label": "Example 6", "title": "Image to Video" },
2831
- {"thumb": "examples/astronaut.png" , "label": "Example 7", "title": "Image to Video" },
 
 
2832
  ],
2833
  title="Click on Our Examples",
2834
  )
@@ -2858,7 +2897,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2858
  fn=apply_example,
2859
  inputs=preset_gallery,
2860
  outputs=[
2861
- input_image,
2862
  prompt_ui,
2863
  camera_ui,
2864
  resolution_ui,
@@ -2866,6 +2905,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2866
  input_video,
2867
  audio_input,
2868
  audio_ui,
 
2869
  ],
2870
  api_visibility="private",
2871
  ).then(
 
1404
 
1405
 
1406
 
1407
+ def generate_video_example(first_frame, prompt, camera_lora, resolution, radioanimated_mode, input_video, input_audio, end_frame, progress=gr.Progress(track_tqdm=True)):
1408
 
1409
  w, h = apply_resolution(resolution)
1410
 
1411
  with timer(f'generating with video path:{input_video} with duration:{duration} and LoRA:{camera_lora} in {w}x{h}'):
1412
  output_video = generate_video(
1413
+ first_frame,
1414
+ end_frame,
1415
  prompt,
1416
  10,
1417
  input_video,
 
1428
  return output_video
1429
 
1430
  def get_duration(
1431
+ first_frame,
1432
+ end_frame,
1433
  prompt,
1434
  duration,
1435
  input_video,
 
1459
  return 120 + extra_time
1460
  else:
1461
  return 180 + extra_time
1462
+
 
1463
  @spaces.GPU(duration=get_duration)
1464
  def generate_video(
1465
+ first_frame,
1466
+ end_frame,
1467
  prompt: str,
1468
  duration: float,
1469
  input_video = None,
 
1480
  """
1481
  Generate a short cinematic video from a text prompt and optional input image using the LTX-2 distilled pipeline.
1482
  Args:
1483
+ first_frame: Optional first frame for image-to-video. If provided, it is injected at frame 0 to guide motion.
1484
+ end_frame: Optional last frame for image-to-video. If provided, it is injected at last frame to guide motion.
1485
  prompt: Text description of the scene, motion, and cinematic style to generate.
1486
  duration: Desired video length in seconds. Converted to frames using a fixed 24 FPS rate.
1487
  input_video: Optional conditioning video path (mp4). If provided, motion is guided by this video.
 
1537
  target_fps=frame_rate,
1538
  )
1539
 
1540
+ if first_frame is None:
1541
  images = [(first_png, 0, 1.0)]
1542
 
1543
  if audio_path is None:
 
1557
  videos = [(cond_path, 1.0)]
1558
  camera_lora = "Pose"
1559
 
1560
+ if first_frame is not None:
1561
+ images = []
1562
+ images.append((first_frame, 0, 1.0))
1563
+
1564
+ if generation_mode == "Interpolate":
1565
+ if end_frame is not None:
1566
+ end_idx = max(0, num_frames - 1)
1567
+ images.append((end_frame, end_idx, 0.5))
1568
 
1569
  embeddings, final_prompt, status = encode_prompt(
1570
  prompt=prompt,
1571
  enhance_prompt=enhance_prompt,
1572
+ input_image=first_frame,
1573
  seed=current_seed,
1574
  negative_prompt="",
1575
  )
 
1662
  return duration_s
1663
 
1664
  def on_mode_change(selected: str):
1665
+ is_motion = (selected == "Motion Control")
1666
+ is_interpolate = (selected == "Interpolate")
1667
 
1668
+ return (gr.update(visible=is_motion), gr.update(visible=is_interpolate))
1669
 
1670
 
1671
 
 
2478
  idx = int(idx)
2479
 
2480
  # Read the example row from your list
2481
+ img, prompt_txt, cam, res, mode, vid, aud, end_img = examples_list[idx]
2482
 
2483
  img_path = img if img else None
2484
  vid_path = vid if vid else None
 
2491
  mode_update = mode
2492
  video_update = gr.update(value=vid_path, visible=(mode == "Motion Control"))
2493
  audio_update = aud_path
2494
+ end_image = end_img
2495
 
2496
  return (
2497
  input_image_update,
 
2502
  video_update,
2503
  audio_update,
2504
  audio_update,
2505
+ end_image,
2506
  )
2507
 
2508
 
 
2534
  with gr.Column(elem_id="col-container"):
2535
  with gr.Row(elem_id="mode-row"):
2536
  radioanimated_mode = RadioAnimated(
2537
+ choices=["Image-to-Video", "Interpolate", "Motion Control"],
2538
  value="Image-to-Video",
2539
  elem_id="radioanimated_mode"
2540
  )
 
2543
 
2544
  with gr.Row():
2545
 
2546
+ first_frame = gr.Image(
2547
  label="First Frame (Optional)",
2548
  type="filepath",
2549
  height=256
2550
  )
2551
+
2552
+ end_frame = gr.Image(
2553
+ label="Last Frame (Optional)",
2554
+ type="filepath",
2555
+ height=256,
2556
+ visible=False,
2557
+ )
2558
+
2559
  input_video = gr.Video(
2560
  label="Motion Reference Video",
2561
  height=256,
 
2719
  radioanimated_mode.change(
2720
  fn=on_mode_change,
2721
  inputs=radioanimated_mode,
2722
+ outputs=[input_video, end_frame],
2723
  api_visibility="private",
2724
  )
2725
 
 
2747
  generate_btn.click(
2748
  fn=generate_video,
2749
  inputs=[
2750
+ first_frame,
2751
+ end_frame,
2752
  prompt,
2753
  duration,
2754
  input_video,
 
2772
  "16:9",
2773
  "Image-to-Video",
2774
  None,
2775
+ "examples/supergirl.m4a",
2776
+ None,
2777
+ ],
2778
+ [
2779
+ "examples/frame3.png",
2780
+ "a woman in a white dress standing in a supermarket, looking at a stack of pomegranates, she picks one and takes a bite, the camera zooms in to a close up of the pomegranate seeds. A calm music is playing in the supermarket and you can hear her taking a bite.",
2781
+ "Zoom In",
2782
+ "16:9",
2783
+ "Interpolate",
2784
+ None,
2785
+ None,
2786
+ "examples/frame4.png",
2787
  ],
2788
  [
2789
  "examples/supergirl.png",
 
2793
  "Image-to-Video",
2794
  None,
2795
  None,
2796
+ None,
2797
  ],
2798
  [
2799
  "examples/clay.png",
 
2803
  "Motion Control",
2804
  "examples/tiktok.mp4",
2805
  None,
2806
+ None,
2807
  ],
2808
  [
2809
  "examples/paint.png",
 
2813
  "Motion Control",
2814
  "examples/tiktok.mp4",
2815
  None,
2816
+ None,
2817
  ],
2818
  [
2819
  "examples/highland.png",
 
2823
  "Image-to-Video",
2824
  None,
2825
  None,
2826
+ None,
2827
  ],
2828
  [
2829
  "examples/wednesday.png",
 
2833
  "Image-to-Video",
2834
  None,
2835
  None,
2836
+ None,
2837
  ],
2838
  [
2839
  "examples/astronaut.png",
 
2843
  "Image-to-Video",
2844
  None,
2845
  None,
2846
+ None,
2847
  ],
2848
  ]
2849
 
2850
  examples_obj = create_examples(
2851
  examples=examples_list,
2852
  fn=generate_video_example,
2853
+ inputs=[first_frame, prompt_ui, camera_ui, resolution_ui, radioanimated_mode, input_video, audio_input, end_frame],
2854
  outputs = [output_video],
2855
  label="Examples",
2856
  cache_examples=True,
 
2859
 
2860
  preset_gallery = PresetGallery(
2861
  items=[
2862
+ {"thumb": "examples/supergirl-2.png", "label": "Example 1", "title": "Image + Audio to Video" },
2863
+ {"thumb": "examples/frame3.png" , "label": "Example 2", "title": "First and Last Frame" },
2864
+ {"thumb": "examples/supergirl.png" , "label": "Example 3", "title": "Image to Video" },
2865
+ {"thumb": "examples/clay.png" , "label": "Example 4", "title": "Pose to Video" },
2866
+ {"thumb": "examples/paint.png" , "label": "Example 5", "title": "Pose to Video" },
2867
+ {"thumb": "examples/highland.png" , "label": "Example 6", "title": "Image to Video" },
2868
+ {"thumb": "examples/wednesday.png" , "label": "Example 7", "title": "Image to Video" },
2869
+ {"thumb": "examples/astronaut.png" , "label": "Example 8", "title": "Image to Video" },
2870
+
2871
  ],
2872
  title="Click on Our Examples",
2873
  )
 
2897
  fn=apply_example,
2898
  inputs=preset_gallery,
2899
  outputs=[
2900
+ first_frame,
2901
  prompt_ui,
2902
  camera_ui,
2903
  resolution_ui,
 
2905
  input_video,
2906
  audio_input,
2907
  audio_ui,
2908
+ end_frame,
2909
  ],
2910
  api_visibility="private",
2911
  ).then(
packages/ltx-pipelines/src/ltx_pipelines/distilled.py CHANGED
@@ -6,6 +6,7 @@ import torch
6
 
7
  from ltx_core.components.diffusion_steps import EulerDiffusionStep
8
  from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
 
9
  from ltx_core.components.noisers import GaussianNoiser
10
  from ltx_core.components.protocols import DiffusionStepProtocol
11
  from ltx_core.conditioning import ConditioningItem, VideoConditionByKeyframeIndex, ConditioningError
@@ -34,6 +35,7 @@ from ltx_pipelines.utils.helpers import (
34
  generate_enhanced_prompt,
35
  get_device,
36
  image_conditionings_by_replacing_latent,
 
37
  simple_denoising_func,
38
  )
39
  from ltx_pipelines.utils.media_io import encode_video, load_video_conditioning
@@ -295,19 +297,8 @@ class DistilledPipeline:
295
  strength=audio_strength,
296
  )
297
 
298
- # Use pre-computed embeddings if provided, otherwise encode text
299
- if video_context is None or audio_context is None:
300
- text_encoder = self.model_ledger.text_encoder()
301
- context_p = encode_text(text_encoder, prompts=[prompt])[0]
302
- video_context, audio_context = context_p
303
-
304
- torch.cuda.synchronize()
305
- del text_encoder
306
- utils.cleanup_memory()
307
- else:
308
- # Move pre-computed embeddings to device if needed
309
- video_context = video_context.to(self.device)
310
- audio_context = audio_context.to(self.device)
311
 
312
  # Stage 1: Initial low resolution video generation.
313
  # Load models only if not already cached
@@ -319,6 +310,7 @@ class DistilledPipeline:
319
  self._transformer = self.model_ledger.transformer()
320
  transformer = self._transformer
321
  stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
 
322
 
323
  def denoising_loop(
324
  sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
@@ -335,7 +327,13 @@ class DistilledPipeline:
335
  ),
336
  )
337
 
338
- stage_1_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
 
 
 
 
 
 
339
  stage_1_conditionings = self._create_conditionings(
340
  images=images,
341
  video_conditioning=video_conditioning,
@@ -370,8 +368,13 @@ class DistilledPipeline:
370
 
371
  stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
372
  stage_2_output_shape = VideoPixelShape(
373
- batch=1, frames=num_frames, width=width * 2, height=height * 2, fps=frame_rate
 
 
 
 
374
  )
 
375
  if apply_video_conditioning_to_stage2:
376
  stage_2_conditionings = self._create_conditionings(
377
  images=images,
@@ -383,7 +386,7 @@ class DistilledPipeline:
383
  video_conditioning_frame_idx=video_conditioning_frame_idx,
384
  )
385
  else:
386
- stage_2_conditionings = image_conditionings_by_replacing_latent(
387
  images=images,
388
  height=stage_2_output_shape.height,
389
  width=stage_2_output_shape.width,
@@ -437,15 +440,37 @@ class DistilledPipeline:
437
  video_conditioning_frame_idx: int,
438
  dtype: torch.dtype,
439
  ):
440
- # 1) Keep ORIGINAL behavior: image conditioning by replacing latent
441
- conditionings = image_conditionings_by_replacing_latent(
442
- images=images,
443
- height=height,
444
- width=width,
445
- video_encoder=video_encoder,
446
- dtype=dtype,
447
- device=self.device,
448
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
 
450
  # 2) Optional: add video conditioning (IC-LoRA style)
451
  if not video_conditioning:
 
6
 
7
  from ltx_core.components.diffusion_steps import EulerDiffusionStep
8
  from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
9
+ from ltx_core.components.schedulers import LTX2Scheduler
10
  from ltx_core.components.noisers import GaussianNoiser
11
  from ltx_core.components.protocols import DiffusionStepProtocol
12
  from ltx_core.conditioning import ConditioningItem, VideoConditionByKeyframeIndex, ConditioningError
 
35
  generate_enhanced_prompt,
36
  get_device,
37
  image_conditionings_by_replacing_latent,
38
+ image_conditionings_by_adding_guiding_latent,
39
  simple_denoising_func,
40
  )
41
  from ltx_pipelines.utils.media_io import encode_video, load_video_conditioning
 
297
  strength=audio_strength,
298
  )
299
 
300
+ video_context = video_context.to(self.device)
301
+ audio_context = audio_context.to(self.device)
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  # Stage 1: Initial low resolution video generation.
304
  # Load models only if not already cached
 
310
  self._transformer = self.model_ledger.transformer()
311
  transformer = self._transformer
312
  stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
313
+ # stage_1_sigmas = LTX2Scheduler().execute(steps=40).to(dtype=torch.float32, device=self.device)
314
 
315
  def denoising_loop(
316
  sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
 
327
  ),
328
  )
329
 
330
+ stage_1_output_shape = VideoPixelShape(
331
+ batch=1,
332
+ frames=num_frames,
333
+ width=width,
334
+ height=height,
335
+ fps=frame_rate,
336
+ )
337
  stage_1_conditionings = self._create_conditionings(
338
  images=images,
339
  video_conditioning=video_conditioning,
 
368
 
369
  stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
370
  stage_2_output_shape = VideoPixelShape(
371
+ batch=1,
372
+ frames=num_frames,
373
+ width=width * 2,
374
+ height=height * 2,
375
+ fps=frame_rate,
376
  )
377
+
378
  if apply_video_conditioning_to_stage2:
379
  stage_2_conditionings = self._create_conditionings(
380
  images=images,
 
386
  video_conditioning_frame_idx=video_conditioning_frame_idx,
387
  )
388
  else:
389
+ stage_2_conditionings = image_conditionings_by_adding_guiding_latent(
390
  images=images,
391
  height=stage_2_output_shape.height,
392
  width=stage_2_output_shape.width,
 
440
  video_conditioning_frame_idx: int,
441
  dtype: torch.dtype,
442
  ):
443
+ # First frame: replace-latent (strong anchor)
444
+ replace_imgs = []
445
+ # End frame: guiding-latent (avoid empty slice)
446
+ guide_imgs = []
447
+
448
+ for path, frame_idx, strength in images:
449
+ if frame_idx == 0:
450
+ replace_imgs.append((path, frame_idx, strength))
451
+ else:
452
+ guide_imgs.append((path, frame_idx, strength))
453
+
454
+ conditionings = []
455
+ if replace_imgs:
456
+ conditionings += image_conditionings_by_replacing_latent(
457
+ images=replace_imgs,
458
+ height=height,
459
+ width=width,
460
+ video_encoder=video_encoder,
461
+ dtype=dtype,
462
+ device=self.device,
463
+ )
464
+ if guide_imgs:
465
+ conditionings += image_conditionings_by_adding_guiding_latent(
466
+ images=guide_imgs,
467
+ height=height,
468
+ width=width,
469
+ video_encoder=video_encoder,
470
+ dtype=dtype,
471
+ device=self.device,
472
+ )
473
+
474
 
475
  # 2) Optional: add video conditioning (IC-LoRA style)
476
  if not video_conditioning: