LTX-2-3-hdr

Running on Zero

App Files Files Community

linoyts HF Staff committed on Apr 22

Commit

b68e12e

verified ·

1 Parent(s): e5ef795

Revert to shipped scene-emb .pt; pre-warm pipeline blocks for ZeroGPU

Browse files

Two changes:

1. Drop Gemma from inference. Re-download the shipped `comfyui_models_loras_ltxv_ltx2_ltx-2.3-22b-ic-lora-hdr-scene-emb.pt` from diffusers-internal-dev/LTX-HDR-LoRA and hand it directly to HDRICLoraPipeline. Removes the prompt textbox and the per-call ~20s Gemma 12B load+encode. Matches the HDR IC-LoRA's training-time scene embedding.

2. Pre-warm the pipeline at module load. Build the fp8-cast, LoRA-fused transformer once (shared between stage_1 and stage_2), plus the ImageConditioner encoder, the VideoUpsampler (encoder + upsampler), and the VideoDecoder. Replace the pipeline's blocks with cached wrappers that reuse the built models and skip gpu_model()'s move of parameters to the meta device on exit. This avoids re-reading the 22B checkpoint, re-fusing the LoRA, and re-casting to fp8 on every @spaces.GPU call.

Tradeoffs: startup takes longer (one build of every component), but each subsequent generation skips the ~30-60s rebuild.

Files changed (1) hide show

app.py +95 -72

app.py CHANGED Viewed

@@ -42,13 +42,8 @@ if _tv.returncode == 0:
     )
 # ─────────────────────────────────────────────────────────────────────────────
-# ltx-core / ltx-pipelines source
-#
-# The HDRICLoraPipeline and its supporting modules (ltx_core.hdr,
-# ltx_pipelines.utils.blocks, load_video_conditioning_hdr, apply_hdr_decode_postprocess,
-# save_exr_tensor, encode_exr_sequence_to_mp4) are NOT on the public main
-# branch at the pinned commit used by the outpaint app. We install from the
-# local ltx-2-internal checkout so the HDR code path actually exists.
 # ─────────────────────────────────────────────────────────────────────────────
 LTX_INTERNAL = Path(os.environ.get(
     "LTX_INTERNAL_PATH",
@@ -72,6 +67,8 @@ import logging
 import random
 import tempfile
 import zipfile
 import torch
 torch._dynamo.config.suppress_errors = True
@@ -80,12 +77,12 @@ torch._dynamo.config.disable = True
 import spaces
 import gradio as gr
 import numpy as np
-from huggingface_hub import hf_hub_download, snapshot_download
 from ltx_core.model.video_vae import TilingConfig
 from ltx_core.quantization import QuantizationPolicy
 from ltx_pipelines.hdr_ic_lora import HDRICLoraPipeline, _make_tiling_config
-from ltx_pipelines.utils.blocks import PromptEncoder
 from ltx_pipelines.utils.media_io import (
     encode_exr_sequence_to_mp4,
     get_videostream_metadata,
@@ -93,7 +90,7 @@ from ltx_pipelines.utils.media_io import (
 )
 from ltx_pipelines.utils.types import OffloadMode
-# xformers attention patch (same as the outpaint app).
 from ltx_core.model.transformer import attention as _attn_mod
 print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
 try:
@@ -111,7 +108,7 @@ logging.getLogger().setLevel(logging.INFO)
 # ─────────────────────────────────────────────────────────────────────────────
 MAX_SEED = np.iinfo(np.int32).max
-# Frames must satisfy (n-1) % 8 == 0. Aspect-ratio canvas sizes (divisible by 32).
 RESOLUTIONS = {
     "low":  {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768),
              "4:3": (768, 576), "3:4": (576, 768), "21:9": (768, 384)},
@@ -122,80 +119,120 @@ RESOLUTIONS = {
 LTX_MODEL_REPO = "Lightricks/LTX-2.3"
 DISTILLED_CHECKPOINT = "ltx-2.3-22b-distilled-1.1.safetensors"
 SPATIAL_UPSCALER = "ltx-2.3-spatial-upscaler-x2-1.1.safetensors"
-GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
 HDR_LORA_REPO = "diffusers-internal-dev/LTX-HDR-LoRA"
 HDR_LORA_FILENAME = "comfyui_models_loras_ltxv_ltx2_ltx-2.3-22b-ic-lora-hdr-0.9 (4).safetensors"
 print("=" * 80)
-print("Downloading LTX-2.3 distilled + spatial upsampler + Gemma + HDR IC-LoRA...")
 print("=" * 80)
 checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename=DISTILLED_CHECKPOINT)
 spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename=SPATIAL_UPSCALER)
 hdr_lora_path = hf_hub_download(repo_id=HDR_LORA_REPO, filename=HDR_LORA_FILENAME)
-gemma_root = snapshot_download(repo_id=GEMMA_REPO)
 print(f"Checkpoint: {checkpoint_path}")
 print(f"Spatial upsampler: {spatial_upsampler_path}")
 print(f"HDR IC-LoRA: {hdr_lora_path}")
-print(f"Gemma root: {gemma_root}")
 # ─────────────────────────────────────────────────────────────────────────────
-# Text encoding: on-the-fly Gemma -> (video_context, audio_context) for each
-# prompt. HDRICLoraPipeline expects a `.pt` path at __init__, so we bootstrap
-# one from an empty prompt, then overwrite `pipeline.text_embeddings` in
-# memory each generate call.
 # ─────────────────────────────────────────────────────────────────────────────
 _DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 _DTYPE = torch.bfloat16
-prompt_encoder = PromptEncoder(
-    checkpoint_path=checkpoint_path,
-    gemma_root=gemma_root,
-    dtype=_DTYPE,
-    device=_DEVICE,
-)
-def encode_prompt_to_contexts(prompt: str) -> tuple[torch.Tensor, torch.Tensor]:
-    """Run Gemma + embeddings processor to produce (video_context, audio_context).
-    HDRICLoraPipeline only consumes video_context; audio_context is stored for
-    shape-compat with the `.pt` interface but ignored during HDR generation.
-    MUST be called from inside a @spaces.GPU context on ZeroGPU.
-    """
-    (out,) = prompt_encoder([prompt])
-    v = out.video_encoding
-    a = out.audio_encoding if out.audio_encoding is not None else torch.zeros(0, device=v.device, dtype=v.dtype)
-    return v, a
-# HDRICLoraPipeline.__init__ requires a .pt it can torch.load, but it only
-# stores the tensors — __call__ reads `self.text_embeddings` which we overwrite
-# on every generate run. So write a placeholder .pt at module-load (CPU, no
-# Gemma run — Gemma can only touch GPU inside a @spaces.GPU function on ZeroGPU).
-_bootstrap_emb_path = Path(tempfile.gettempdir()) / "ltx_hdr_bootstrap_emb.pt"
-_placeholder = torch.zeros(1, 1, 4096, dtype=_DTYPE)
-torch.save({"video_context": _placeholder, "audio_context": _placeholder}, _bootstrap_emb_path)
-# ─────────────────────────────────────────────────────────────────────────────
-# Initialize pipeline
-# ─────────────────────────────────────────────────────────────────────────────
-# HDRICLoraPipeline is video-only (no audio path). HDR transform (LogC3) and
-# reference_downscale_factor are auto-detected from the LoRA metadata.
 pipeline = HDRICLoraPipeline(
     distilled_checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     hdr_lora=hdr_lora_path,
-    text_embeddings_path=str(_bootstrap_emb_path),
     quantization=QuantizationPolicy.fp8_cast(),
     offload_mode=OffloadMode.NONE,
 )
 print(f"HDRICLoraPipeline ready. HDR transform: {pipeline.hdr_transform}, "
       f"ref_downscale={pipeline.reference_downscale_factor}")
 print("=" * 80)
@@ -241,7 +278,6 @@ def on_video_upload(video):
 @torch.inference_mode()
 def generate_video(
     input_video,
-    prompt: str,
     duration: float,
     frame_rate: float,
     target_aspect: str,
@@ -270,13 +306,7 @@ def generate_video(
         print(f"[HDR] {target_h}x{target_w}, frames={num_frames}, fps={frame_rate}, "
               f"seed={current_seed}, aspect={target_aspect}, hq_hdr={high_quality_hdr}")
-        # Encode prompt -> (video_context, audio_context) and swap into the
-        # pipeline. Gemma is loaded, used, and freed inside prompt_encoder.
-        print(f"[HDR] Encoding prompt: {prompt!r}")
-        video_context, audio_context = encode_prompt_to_contexts(prompt or "")
-        pipeline.text_embeddings = (video_context, audio_context)
-        # Tiling config: smaller spatial tile on lower-VRAM targets
         tiling_config = _make_tiling_config(spatial_tile=768 if not high_res else 1280)
         hdr_video = pipeline(
@@ -341,20 +371,13 @@ with gr.Blocks(title="LTX 2.3 HDR", css=css, theme=theme) as demo:
     gr.Markdown("""
 # LTX 2.3 HDR ✨
 Video-to-video HDR via LTX-2.3 + [HDR IC-LoRA](https://huggingface.co/diffusers-internal-dev/LTX-HDR-LoRA).
-Output is linear HDR (LogC3 inverse decoded — auto-detected from LoRA metadata). The preview mp4 is a fixed-EV sRGB tonemap; the EXR zip contains the full linear float frames for grading.
     """)
     with gr.Row():
         with gr.Column(scale=1):
             input_video = gr.Video(label="Source Video")
-            prompt = gr.Textbox(
-                label="Prompt",
-                info="Describes the scene being regenerated in HDR. Encoded through Gemma on each run.",
-                lines=2,
-                placeholder="a cinematic sunset over mountains, high dynamic range, bright sky, deep shadows",
-            )
             with gr.Row():
                 target_aspect = gr.Dropdown(
                     label="Aspect Ratio",
@@ -398,7 +421,7 @@ Output is linear HDR (LogC3 inverse decoded — auto-detected from LoRA metadata
     generate_btn.click(
         fn=generate_video,
         inputs=[
-            input_video, prompt, duration, frame_rate, target_aspect, high_res,
             seed, randomize_seed, high_quality_hdr, export_exr,
         ],
         outputs=[output_video, output_exr, seed],

     )
 # ─────────────────────────────────────────────────────────────────────────────
+# ltx-core / ltx-pipelines source (bundled — the HDR code path is not on
+# public Lightricks/LTX-2 main).
 # ─────────────────────────────────────────────────────────────────────────────
 LTX_INTERNAL = Path(os.environ.get(
     "LTX_INTERNAL_PATH",
 import random
 import tempfile
 import zipfile
+from collections.abc import Iterator
+from contextlib import contextmanager
 import torch
 torch._dynamo.config.suppress_errors = True
 import spaces
 import gradio as gr
 import numpy as np
+from huggingface_hub import hf_hub_download
+from ltx_core.model.upsampler import upsample_video
 from ltx_core.model.video_vae import TilingConfig
 from ltx_core.quantization import QuantizationPolicy
 from ltx_pipelines.hdr_ic_lora import HDRICLoraPipeline, _make_tiling_config
 from ltx_pipelines.utils.media_io import (
     encode_exr_sequence_to_mp4,
     get_videostream_metadata,
 )
 from ltx_pipelines.utils.types import OffloadMode
+# xformers attention patch
 from ltx_core.model.transformer import attention as _attn_mod
 print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
 try:
 # ─────────────────────────────────────────────────────────────────────────────
 MAX_SEED = np.iinfo(np.int32).max
+# Canvas sizes divisible by 32; kept conservative for A10G 24 GB.
 RESOLUTIONS = {
     "low":  {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768),
              "4:3": (768, 576), "3:4": (576, 768), "21:9": (768, 384)},
 LTX_MODEL_REPO = "Lightricks/LTX-2.3"
 DISTILLED_CHECKPOINT = "ltx-2.3-22b-distilled-1.1.safetensors"
 SPATIAL_UPSCALER = "ltx-2.3-spatial-upscaler-x2-1.1.safetensors"
 HDR_LORA_REPO = "diffusers-internal-dev/LTX-HDR-LoRA"
 HDR_LORA_FILENAME = "comfyui_models_loras_ltxv_ltx2_ltx-2.3-22b-ic-lora-hdr-0.9 (4).safetensors"
+HDR_SCENE_EMB_FILENAME = "comfyui_models_loras_ltxv_ltx2_ltx-2.3-22b-ic-lora-hdr-scene-emb.pt"
 print("=" * 80)
+print("Downloading LTX-2.3 distilled + spatial upsampler + HDR IC-LoRA + scene emb...")
 print("=" * 80)
 checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename=DISTILLED_CHECKPOINT)
 spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename=SPATIAL_UPSCALER)
 hdr_lora_path = hf_hub_download(repo_id=HDR_LORA_REPO, filename=HDR_LORA_FILENAME)
+hdr_scene_emb_path = hf_hub_download(repo_id=HDR_LORA_REPO, filename=HDR_SCENE_EMB_FILENAME)
 print(f"Checkpoint: {checkpoint_path}")
 print(f"Spatial upsampler: {spatial_upsampler_path}")
 print(f"HDR IC-LoRA: {hdr_lora_path}")
+print(f"HDR scene emb: {hdr_scene_emb_path}")
 # ─────────────────────────────────────────────────────────────────────────────
+# Initialize pipeline — text conditioning comes from the shipped scene-emb
+# .pt (no Gemma at inference time).
 # ─────────────────────────────────────────────────────────────────────────────
 _DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 _DTYPE = torch.bfloat16
 pipeline = HDRICLoraPipeline(
     distilled_checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     hdr_lora=hdr_lora_path,
+    text_embeddings_path=hdr_scene_emb_path,
     quantization=QuantizationPolicy.fp8_cast(),
     offload_mode=OffloadMode.NONE,
 )
 print(f"HDRICLoraPipeline ready. HDR transform: {pipeline.hdr_transform}, "
       f"ref_downscale={pipeline.reference_downscale_factor}")
+# ─────────────────────────────────────────────────────────────────────────────
+# Pre-warm for ZeroGPU: build each component once at module load (ZeroGPU
+# tensor-packing captures the weights), then replace the pipeline's blocks
+# with tiny wrappers that reuse the cached models and skip gpu_model's
+# meta-device freeing. Avoids re-reading the 22B checkpoint + re-fusing
+# the LoRA + re-fp8-casting on every @spaces.GPU invocation.
+# ─────────────────────────────────────────────────────────────────────────────
+print("Pre-warming models (one-shot build)...")
+_cached_image_encoder = pipeline.image_conditioner._build_encoder()
+_cached_transformer = pipeline.stage_1._build_transformer()
+_cached_upsampler_encoder = (
+    pipeline.upsampler._encoder_builder
+    .build(device=_DEVICE, dtype=_DTYPE).to(_DEVICE).eval()
+)
+_cached_upsampler = (
+    pipeline.upsampler._upsampler_builder
+    .build(device=_DEVICE, dtype=_DTYPE).to(_DEVICE).eval()
+)
+_cached_video_decoder = (
+    pipeline.video_decoder._decoder_builder
+    .build(device=_DEVICE, dtype=_DTYPE).to(_DEVICE).eval()
+)
+@contextmanager
+def _yield_cached(model):
+    """Drop-in for gpu_model that does NOT move params to meta on exit."""
+    yield model
+# Patch the transformer context manager on both stages to yield the cached
+# transformer without freeing. stage_1 uses _transformer_ctx inside __call__;
+# stage_2 uses model_context() -> _transformer_ctx.
+def _cached_stage_ctx(**_kwargs):
+    return _yield_cached(_cached_transformer)
+pipeline.stage_1._transformer_ctx = _cached_stage_ctx
+pipeline.stage_2._transformer_ctx = _cached_stage_ctx
+class _CachedImageConditioner:
+    def __call__(self, fn):
+        return fn(_cached_image_encoder)
+class _CachedVideoUpsampler:
+    def __call__(self, latent):
+        return upsample_video(
+            latent=latent,
+            video_encoder=_cached_upsampler_encoder,
+            upsampler=_cached_upsampler,
+        )
+class _CachedVideoDecoder:
+    def __call__(
+        self,
+        latent: torch.Tensor,
+        tiling_config=None,
+        generator=None,
+        *,
+        output_dtype: torch.dtype = torch.uint8,
+    ) -> Iterator[torch.Tensor]:
+        return _cached_video_decoder.decode_video(
+            latent, tiling_config, generator, output_dtype=output_dtype,
+        )
+pipeline.image_conditioner = _CachedImageConditioner()
+pipeline.upsampler = _CachedVideoUpsampler()
+pipeline.video_decoder = _CachedVideoDecoder()
+print("Pre-warm complete.")
 print("=" * 80)
 @torch.inference_mode()
 def generate_video(
     input_video,
     duration: float,
     frame_rate: float,
     target_aspect: str,
         print(f"[HDR] {target_h}x{target_w}, frames={num_frames}, fps={frame_rate}, "
               f"seed={current_seed}, aspect={target_aspect}, hq_hdr={high_quality_hdr}")
+        # Smaller spatial tile on non-high-res to keep VAE decode within A10G budget.
         tiling_config = _make_tiling_config(spatial_tile=768 if not high_res else 1280)
         hdr_video = pipeline(
     gr.Markdown("""
 # LTX 2.3 HDR ✨
 Video-to-video HDR via LTX-2.3 + [HDR IC-LoRA](https://huggingface.co/diffusers-internal-dev/LTX-HDR-LoRA).
+Text conditioning uses the shipped pre-computed scene embedding (`scene-emb.pt`) — no prompt input, no per-call Gemma cost. Output is linear HDR (LogC3 inverse decoded, auto-detected from LoRA metadata). The preview mp4 is a fixed-EV sRGB tonemap; the EXR zip contains the full linear float frames for grading.
     """)
     with gr.Row():
         with gr.Column(scale=1):
             input_video = gr.Video(label="Source Video")
             with gr.Row():
                 target_aspect = gr.Dropdown(
                     label="Aspect Ratio",
     generate_btn.click(
         fn=generate_video,
         inputs=[
+            input_video, duration, frame_rate, target_aspect, high_res,
             seed, randomize_seed, high_quality_hdr, export_exr,
         ],
         outputs=[output_video, output_exr, seed],