LTX-2-3-sync

Paused

App Files Files Community

linoyts HF Staff committed on Mar 7

Commit

2e243c5

verified ·

1 Parent(s): 737cc06

Update app.py

Browse files

Files changed (1) hide show

app.py +196 -85

app.py CHANGED Viewed

@@ -42,9 +42,11 @@ import gradio as gr
 import numpy as np
 from huggingface_hub import hf_hub_download, snapshot_download
 from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
 from ltx_core.quantization import QuantizationPolicy
-from ltx_pipelines.distilled import DistilledPipeline
 from ltx_pipelines.utils.args import ImageConditioningInput
 from ltx_pipelines.utils.media_io import encode_video
@@ -61,12 +63,6 @@ except Exception as e:
 logging.getLogger().setLevel(logging.INFO)
 MAX_SEED = np.iinfo(np.int32).max
-DEFAULT_PROMPT = (
-    "An astronaut hatches from a fragile egg on the surface of the Moon, "
-    "the shell cracking and peeling apart in gentle low-gravity motion. "
-    "Fine lunar dust lifts and drifts outward with each movement, floating "
-    "in slow arcs before settling back onto the ground."
-)
 DEFAULT_FRAME_RATE = 24.0
 # Resolution presets: (width, height)
@@ -76,52 +72,104 @@ RESOLUTIONS = {
 }
 # Model repos
-LTX_MODEL_REPO = "Lightricks/LTX-2.3"
 GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
 # Download model checkpoints
 print("=" * 80)
-print("Downloading LTX-2.3 distilled model + Gemma...")
 print("=" * 80)
 checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-22b-distilled.safetensors")
 spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
 gemma_root = snapshot_download(repo_id=GEMMA_REPO)
 print(f"Checkpoint: {checkpoint_path}")
 print(f"Spatial upsampler: {spatial_upsampler_path}")
 print(f"Gemma root: {gemma_root}")
-# Initialize pipeline WITH text encoder
-pipeline = DistilledPipeline(
-    distilled_checkpoint_path=checkpoint_path,
-    spatial_upsampler_path=spatial_upsampler_path,
-    gemma_root=gemma_root,
-    loras=[],
-    quantization=QuantizationPolicy.fp8_cast(),
-)
-# Preload all models for ZeroGPU tensor packing.
-print("Preloading all models (including Gemma)...")
-ledger = pipeline.model_ledger
-_transformer = ledger.transformer()
-_video_encoder = ledger.video_encoder()
-_video_decoder = ledger.video_decoder()
-_audio_decoder = ledger.audio_decoder()
-_vocoder = ledger.vocoder()
-_spatial_upsampler = ledger.spatial_upsampler()
-_text_encoder = ledger.text_encoder()
-_embeddings_processor = ledger.gemma_embeddings_processor()
-ledger.transformer = lambda: _transformer
-ledger.video_encoder = lambda: _video_encoder
-ledger.video_decoder = lambda: _video_decoder
-ledger.audio_decoder = lambda: _audio_decoder
-ledger.vocoder = lambda: _vocoder
-ledger.spatial_upsampler = lambda: _spatial_upsampler
-ledger.text_encoder = lambda: _text_encoder
-ledger.gemma_embeddings_processor = lambda: _embeddings_processor
-print("All models preloaded (including Gemma text encoder)!")
 print("=" * 80)
 print("Pipeline ready!")
@@ -136,14 +184,14 @@ def log_memory(tag: str):
         print(f"[VRAM {tag}] allocated={allocated:.2f}GB peak={peak:.2f}GB free={free / 1024**3:.2f}GB total={total / 1024**3:.2f}GB")
-def detect_aspect_ratio(image) -> str:
-    """Detect the closest aspect ratio (16:9, 9:16, or 1:1) from an image."""
-    if image is None:
         return "16:9"
-    if hasattr(image, "size"):
-        w, h = image.size
-    elif hasattr(image, "shape"):
-        h, w = image.shape[:2]
     else:
         return "16:9"
     ratio = w / h
@@ -151,39 +199,55 @@ def detect_aspect_ratio(image) -> str:
     return min(candidates, key=lambda k: abs(ratio - candidates[k]))
-def on_image_upload(image, high_res):
-    """Auto-set resolution when image is uploaded."""
-    aspect = detect_aspect_ratio(image)
     tier = "high" if high_res else "low"
     w, h = RESOLUTIONS[tier][aspect]
     return gr.update(value=w), gr.update(value=h)
-def on_highres_toggle(image, high_res):
     """Update resolution when high-res toggle changes."""
-    aspect = detect_aspect_ratio(image)
     tier = "high" if high_res else "low"
     w, h = RESOLUTIONS[tier][aspect]
     return gr.update(value=w), gr.update(value=h)
-@spaces.GPU(duration=75)
 @torch.inference_mode()
 def generate_video(
-    input_image,
     prompt: str,
     duration: float,
-    enhance_prompt: bool = True,
-    seed: int = 42,
-    randomize_seed: bool = True,
-    height: int = 1024,
-    width: int = 1536,
     progress=gr.Progress(track_tqdm=True),
 ):
     try:
         torch.cuda.reset_peak_memory_stats()
         log_memory("start")
         current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
         frame_rate = DEFAULT_FRAME_RATE
@@ -191,24 +255,42 @@ def generate_video(
         num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1
         print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")
         images = []
-        if input_image is not None:
-            output_dir = Path("outputs")
-            output_dir.mkdir(exist_ok=True)
-            temp_image_path = output_dir / f"temp_input_{current_seed}.jpg"
-            if hasattr(input_image, "save"):
-                input_image.save(temp_image_path)
             else:
-                temp_image_path = Path(input_image)
-            images = [ImageConditioningInput(path=str(temp_image_path), frame_idx=0, strength=1.0)]
         tiling_config = TilingConfig.default()
         video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
         log_memory("before pipeline call")
-        video, audio = pipeline(
             prompt=prompt,
             seed=current_seed,
             height=int(height),
@@ -216,8 +298,11 @@ def generate_video(
             num_frames=num_frames,
             frame_rate=frame_rate,
             images=images,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
         )
         log_memory("after pipeline call")
@@ -234,6 +319,8 @@ def generate_video(
         log_memory("after encode_video")
         return str(output_path), current_seed
     except Exception as e:
         import traceback
         log_memory("on error")
@@ -241,35 +328,55 @@ def generate_video(
         return None, current_seed
-with gr.Blocks(title="LTX-2.3 Distilled") as demo:
-    gr.Markdown("# LTX-2.3 Distilled (22B): Fast Audio-Video Generation")
     gr.Markdown(
-        "Fast and high quality video + audio generation"
         "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
         "[[code]](https://github.com/Lightricks/LTX-2)"
     )
     with gr.Row():
         with gr.Column():
-            input_image = gr.Image(label="Input Image (Optional)", type="pil")
             prompt = gr.Textbox(
                 label="Prompt",
-                info="for best results - make it as elaborate as possible",
-                value="Make this image come alive with cinematic motion, smooth animation",
                 lines=3,
-                placeholder="Describe the motion and animation you want...",
             )
             with gr.Row():
                 duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
                 with gr.Column():
                     enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=False)
                     high_res = gr.Checkbox(label="High Resolution", value=True)
             generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
             with gr.Accordion("Advanced Settings", open=False):
-                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, value=10, step=1)
                 randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                 with gr.Row():
                     width = gr.Number(label="Width", value=1536, precision=0)
@@ -278,24 +385,29 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
         with gr.Column():
             output_video = gr.Video(label="Generated Video", autoplay=True)
-    # Auto-detect aspect ratio from uploaded image and set resolution
-    input_image.change(
-        fn=on_image_upload,
-        inputs=[input_image, high_res],
         outputs=[width, height],
     )
-    # Update resolution when high-res toggle changes
     high_res.change(
         fn=on_highres_toggle,
-        inputs=[input_image, high_res],
         outputs=[width, height],
     )
     generate_btn.click(
         fn=generate_video,
         inputs=[
-            input_image, prompt, duration, enhance_prompt,
             seed, randomize_seed, height, width,
         ],
         outputs=[output_video, seed],
@@ -307,5 +419,4 @@ css = """
 """
 if __name__ == "__main__":
-    demo.launch(theme=gr.themes.Citrus(), css=css)

 import numpy as np
 from huggingface_hub import hf_hub_download, snapshot_download
+from ltx_core.loader import LoraPathStrengthAndSDOps
+from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
 from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
 from ltx_core.quantization import QuantizationPolicy
+from ltx_pipelines.ic_lora import ICLoraPipeline
 from ltx_pipelines.utils.args import ImageConditioningInput
 from ltx_pipelines.utils.media_io import encode_video
 logging.getLogger().setLevel(logging.INFO)
 MAX_SEED = np.iinfo(np.int32).max
 DEFAULT_FRAME_RATE = 24.0
 # Resolution presets: (width, height)
 }
 # Model repos
+LTX_MODEL_REPO = "diffusers-internal-dev/ltx-23"
 GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
+# Available IC-LoRAs for LTX-2.3 (22B)
+# Maps a human-readable name to the Hub location of the LoRA checkpoint:
+#   "repo"     - repo_id passed to hf_hub_download
+#   "filename" - checkpoint file fetched from that repo
+# The keys double as the choices of the "IC-LoRA" dropdown in the UI and as
+# the keys of ic_lora_paths, so renaming a key changes what users see.
+IC_LORA_OPTIONS = {
+    "Union Control (Depth + Canny)": {
+        "repo": "Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control",
+        "filename": "ltx-2.3-22b-ic-lora-union-control-ref0.5.safetensors",
+    },
+    "Motion Track Control": {
+        "repo": "Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control",
+        "filename": "ltx-2.3-22b-ic-lora-motion-track-control-ref0.5.safetensors",
+    },
+}
 # Download model checkpoints
 print("=" * 80)
+print("Downloading LTX-2.3 distilled model + Gemma + IC-LoRAs...")
 print("=" * 80)
 checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-22b-distilled.safetensors")
 spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
 gemma_root = snapshot_download(repo_id=GEMMA_REPO)
+# Pre-download all IC-LoRA checkpoints
+ic_lora_paths = {}
+for name, info in IC_LORA_OPTIONS.items():
+    path = hf_hub_download(repo_id=info["repo"], filename=info["filename"])
+    ic_lora_paths[name] = path
+    print(f"IC-LoRA '{name}': {path}")
 print(f"Checkpoint: {checkpoint_path}")
 print(f"Spatial upsampler: {spatial_upsampler_path}")
 print(f"Gemma root: {gemma_root}")
+# Build initial pipeline with the first IC-LoRA
+# These lines only choose the default and declare the module-level state;
+# the pipeline itself is built further down, after build_pipeline() and
+# preload_pipeline() have been defined.
+# default_lora_name must be a key of IC_LORA_OPTIONS: it indexes
+# ic_lora_paths here and is the initial value of the IC-LoRA dropdown.
+default_lora_name = "Union Control (Depth + Canny)"
+# NOTE(review): default_lora_path is not referenced anywhere else in the
+# visible part of this diff - confirm it is still needed.
+default_lora_path = ic_lora_paths[default_lora_name]
+# Currently loaded pipeline and the name of the IC-LoRA it was built with.
+# generate_video() declares both as global and reassigns them when the
+# requested IC-LoRA differs from current_lora_name.
+current_pipeline = None
+current_lora_name = None
+def build_pipeline(lora_name: str) -> ICLoraPipeline:
+    """Build an ICLoraPipeline with the given IC-LoRA.
+
+    Args:
+        lora_name: Key of ``ic_lora_paths`` (same keys as ``IC_LORA_OPTIONS``)
+            selecting which pre-downloaded IC-LoRA checkpoint to attach.
+
+    Returns:
+        A new ``ICLoraPipeline`` configured with the distilled checkpoint,
+        the spatial upsampler, the Gemma root and the selected LoRA. This
+        function does not call ``preload_pipeline``; the caller does.
+
+    Raises:
+        KeyError: If ``lora_name`` is not a key of ``ic_lora_paths``.
+    """
+    # Resolve the local checkpoint path downloaded at startup.
+    lora_path = ic_lora_paths[lora_name]
+    # The LoRA is always applied at full strength (1.0); sd_ops is the
+    # LTXV_LORA_COMFY_RENAMING_MAP imported from ltx_core.loader.sd_ops.
+    lora = LoraPathStrengthAndSDOps(
+        path=lora_path,
+        strength=1.0,
+        sd_ops=LTXV_LORA_COMFY_RENAMING_MAP,
+    )
+    # checkpoint_path, spatial_upsampler_path and gemma_root are the
+    # module-level paths resolved by the download step above.
+    pipe = ICLoraPipeline(
+        distilled_checkpoint_path=checkpoint_path,
+        spatial_upsampler_path=spatial_upsampler_path,
+        gemma_root=gemma_root,
+        loras=[lora],
+        quantization=QuantizationPolicy.fp8_cast(),
+    )
+    return pipe
+def preload_pipeline(pipe: ICLoraPipeline) -> None:
+    """Preload all models from both ledgers for ZeroGPU tensor packing.
+
+    Each ledger factory method used here is called exactly once and the
+    attribute on the ledger is then overwritten with a lambda that returns
+    the instance created by that call, so later calls through the ledger
+    get the already-created object instead of invoking the factory again.
+
+    Args:
+        pipe: Pipeline whose ``stage_1_model_ledger`` and
+            ``stage_2_model_ledger`` are patched in place.
+
+    Returns:
+        None. The ledgers on ``pipe`` are mutated.
+    """
+    print("Preloading stage 1 models (with IC-LoRA)...")
+    s1 = pipe.stage_1_model_ledger
+    # Instantiate first, patch afterwards: the factories must be called
+    # before the attributes are replaced by the caching lambdas below.
+    _s1_transformer = s1.transformer()
+    _s1_video_encoder = s1.video_encoder()
+    _s1_text_encoder = s1.text_encoder()
+    _s1_embeddings_processor = s1.gemma_embeddings_processor()
+    s1.transformer = lambda: _s1_transformer
+    s1.video_encoder = lambda: _s1_video_encoder
+    s1.text_encoder = lambda: _s1_text_encoder
+    s1.gemma_embeddings_processor = lambda: _s1_embeddings_processor
+    print("Preloading stage 2 models (without IC-LoRA)...")
+    s2 = pipe.stage_2_model_ledger
+    # Stage 2 preloads the decoding / upsampling components; the text
+    # encoder and embeddings processor are only preloaded on stage 1.
+    _s2_transformer = s2.transformer()
+    _s2_video_encoder = s2.video_encoder()
+    _s2_video_decoder = s2.video_decoder()
+    _s2_audio_decoder = s2.audio_decoder()
+    _s2_vocoder = s2.vocoder()
+    _s2_spatial_upsampler = s2.spatial_upsampler()
+    s2.transformer = lambda: _s2_transformer
+    s2.video_encoder = lambda: _s2_video_encoder
+    s2.video_decoder = lambda: _s2_video_decoder
+    s2.audio_decoder = lambda: _s2_audio_decoder
+    s2.vocoder = lambda: _s2_vocoder
+    s2.spatial_upsampler = lambda: _s2_spatial_upsampler
+    print("All models preloaded!")
+print(f"Building initial pipeline with IC-LoRA: {default_lora_name}")
+current_pipeline = build_pipeline(default_lora_name)
+current_lora_name = default_lora_name
+preload_pipeline(current_pipeline)
 print("=" * 80)
 print("Pipeline ready!")
         print(f"[VRAM {tag}] allocated={allocated:.2f}GB peak={peak:.2f}GB free={free / 1024**3:.2f}GB total={total / 1024**3:.2f}GB")
+def detect_aspect_ratio(media) -> str:
+    """Detect the closest aspect ratio from an image or video."""
+    if media is None:
         return "16:9"
+    if hasattr(media, "size"):
+        w, h = media.size
+    elif hasattr(media, "shape"):
+        h, w = media.shape[:2]
     else:
         return "16:9"
     ratio = w / h
     return min(candidates, key=lambda k: abs(ratio - candidates[k]))
+def on_media_upload(first_image, last_image, high_res):
+    """Auto-set resolution when media is uploaded.
+
+    Args:
+        first_image: Value of the "First Frame" image input, or None.
+        last_image: Value of the "Last Frame" image input, or None.
+        high_res: State of the "High Resolution" checkbox.
+
+    Returns:
+        Two ``gr.update`` objects carrying the new width and height, in
+        that order (the handler is wired to ``outputs=[width, height]``).
+    """
+    # The first frame takes precedence; the last frame is only consulted
+    # when no first frame is set. If both are None, detect_aspect_ratio
+    # receives None and returns its "16:9" default.
+    ref = first_image if first_image is not None else last_image
+    aspect = detect_aspect_ratio(ref)
     tier = "high" if high_res else "low"
     w, h = RESOLUTIONS[tier][aspect]
     return gr.update(value=w), gr.update(value=h)
+def on_highres_toggle(first_image, last_image, high_res):
     """Update resolution when high-res toggle changes."""
+    # NOTE(review): the body is identical to on_media_upload(); the two
+    # handlers differ only in which component's change event calls them.
+    # Prefer the first frame as the aspect-ratio reference and fall back to
+    # the last frame when no first frame is set.
+    ref = first_image if first_image is not None else last_image
+    aspect = detect_aspect_ratio(ref)
     tier = "high" if high_res else "low"
     w, h = RESOLUTIONS[tier][aspect]
     return gr.update(value=w), gr.update(value=h)
+@spaces.GPU(duration=120)
 @torch.inference_mode()
 def generate_video(
+    first_image,
+    last_image,
+    conditioning_video,
     prompt: str,
     duration: float,
+    ic_lora_choice: str,
+    conditioning_strength: float,
+    enhance_prompt: bool,
+    skip_stage_2: bool,
+    seed: int,
+    randomize_seed: bool,
+    height: int,
+    width: int,
     progress=gr.Progress(track_tqdm=True),
 ):
+    global current_pipeline, current_lora_name
     try:
         torch.cuda.reset_peak_memory_stats()
         log_memory("start")
+        # Rebuild pipeline if IC-LoRA changed
+        if ic_lora_choice != current_lora_name:
+            print(f"Switching IC-LoRA: {current_lora_name} → {ic_lora_choice}")
+            current_pipeline = build_pipeline(ic_lora_choice)
+            current_lora_name = ic_lora_choice
+            preload_pipeline(current_pipeline)
         current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
         frame_rate = DEFAULT_FRAME_RATE
         num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1
         print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")
+        print(f"IC-LoRA: {ic_lora_choice}, conditioning_strength: {conditioning_strength}")
+        output_dir = Path("outputs")
+        output_dir.mkdir(exist_ok=True)
+        # Build image conditionings (first / last frame)
         images = []
+        if first_image is not None:
+            temp_first_path = output_dir / f"temp_first_{current_seed}.jpg"
+            if hasattr(first_image, "save"):
+                first_image.save(temp_first_path)
             else:
+                temp_first_path = Path(first_image)
+            images.append(ImageConditioningInput(path=str(temp_first_path), frame_idx=0, strength=1.0))
+        if last_image is not None:
+            temp_last_path = output_dir / f"temp_last_{current_seed}.jpg"
+            if hasattr(last_image, "save"):
+                last_image.save(temp_last_path)
+            else:
+                temp_last_path = Path(last_image)
+            images.append(ImageConditioningInput(path=str(temp_last_path), frame_idx=num_frames - 1, strength=1.0))
+        # Build video conditioning for IC-LoRA (reference video)
+        video_conditioning = []
+        if conditioning_video is not None:
+            video_path = str(conditioning_video)
+            video_conditioning.append((video_path, conditioning_strength))
+            print(f"Video conditioning: {video_path} (strength={conditioning_strength})")
         tiling_config = TilingConfig.default()
         video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
         log_memory("before pipeline call")
+        video, audio = current_pipeline(
             prompt=prompt,
             seed=current_seed,
             height=int(height),
             num_frames=num_frames,
             frame_rate=frame_rate,
             images=images,
+            video_conditioning=video_conditioning,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
+            conditioning_attention_strength=1.0,
+            skip_stage_2=skip_stage_2,
         )
         log_memory("after pipeline call")
         log_memory("after encode_video")
         return str(output_path), current_seed
+    except gr.Error:
+        raise
     except Exception as e:
         import traceback
         log_memory("on error")
         return None, current_seed
+with gr.Blocks(title="LTX-2.3 IC-LoRA") as demo:
+    gr.Markdown("# LTX-2.3 IC-LoRA: Video-to-Video & Image-to-Video Control")
     gr.Markdown(
+        "Video-to-video transformations using IC-LoRA conditioning "
+        "(depth + canny union control, motion tracking). Upload a **conditioning video** "
+        "as the IC-LoRA reference signal, optionally pin first/last frame images, "
+        "and describe the desired output. "
         "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
         "[[code]](https://github.com/Lightricks/LTX-2)"
     )
     with gr.Row():
         with gr.Column():
+            conditioning_video = gr.Video(
+                label="Conditioning Video (IC-LoRA Reference)",
+                sources=["upload"],
+            )
+            with gr.Row():
+                first_image = gr.Image(label="First Frame (Optional)", type="pil")
+                last_image = gr.Image(label="Last Frame (Optional)", type="pil")
             prompt = gr.Textbox(
                 label="Prompt",
+                info="Describe the desired output — the IC-LoRA controls structure from the reference",
+                value="A cinematic scene with dramatic lighting and rich detail, smooth motion",
                 lines=3,
+                placeholder="Describe the video you want to generate...",
             )
             with gr.Row():
                 duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
+                ic_lora_choice = gr.Dropdown(
+                    label="IC-LoRA",
+                    choices=list(IC_LORA_OPTIONS.keys()),
+                    value=default_lora_name,
+                )
+            with gr.Row():
+                conditioning_strength = gr.Slider(
+                    label="Conditioning Strength", minimum=0.1, maximum=1.0, value=1.0, step=0.05,
+                )
                 with gr.Column():
                     enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=False)
                     high_res = gr.Checkbox(label="High Resolution", value=True)
+                    skip_stage_2 = gr.Checkbox(label="Skip Stage 2 (faster, half res)", value=False)
             generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
             with gr.Accordion("Advanced Settings", open=False):
+                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, value=42, step=1)
                 randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
                 with gr.Row():
                     width = gr.Number(label="Width", value=1536, precision=0)
         with gr.Column():
             output_video = gr.Video(label="Generated Video", autoplay=True)
+    # Auto-detect aspect ratio from uploaded images
+    first_image.change(
+        fn=on_media_upload,
+        inputs=[first_image, last_image, high_res],
+        outputs=[width, height],
+    )
+    last_image.change(
+        fn=on_media_upload,
+        inputs=[first_image, last_image, high_res],
         outputs=[width, height],
     )
     high_res.change(
         fn=on_highres_toggle,
+        inputs=[first_image, last_image, high_res],
         outputs=[width, height],
     )
     generate_btn.click(
         fn=generate_video,
         inputs=[
+            first_image, last_image, conditioning_video,
+            prompt, duration, ic_lora_choice, conditioning_strength,
+            enhance_prompt, skip_stage_2,
             seed, randomize_seed, height, width,
         ],
         outputs=[output_video, seed],
 """
 if __name__ == "__main__":
+    demo.launch(theme=gr.themes.Citrus(), css=css)