# LTX-2-3-hdr / app.py
# StatusReport's picture
# App: update HDR checkpoint path
# e07c454 verified
import os
import subprocess
import sys
from pathlib import Path
# Environment switches. They are set here, ahead of the torch / cv2 imports
# that appear further down this file.
os.environ.update({
    # Disable torch.compile / dynamo before any torch import.
    "TORCH_COMPILE_DISABLE": "1",
    "TORCHDYNAMO_DISABLE": "1",
    # OpenImageIO / OpenEXR readers need this to be set before cv2 import.
    "OPENCV_IO_ENABLE_OPENEXR": "1",
})

# Common prefix of every runtime `pip install` issued below.
_PIP_INSTALL = [sys.executable, "-m", "pip", "install"]

# xformers for memory-efficient attention. Best-effort: check=False means a
# failed install does not raise here.
subprocess.run(
    [*_PIP_INSTALL, "xformers==0.0.32.post2", "--no-build-isolation"],
    check=False,
)

# Video / HDR dependencies, also best-effort.
subprocess.run(
    [
        *_PIP_INSTALL,
        "imageio[ffmpeg]",
        "scikit-image",
        "opencv-python-headless",
        "decord",
        "num2words",
        "OpenImageIO",
    ],
    check=False,
)

# NOTE(review): num2words is already part of the best-effort batch above; this
# second install uses check=True, so a failure here raises CalledProcessError.
subprocess.run([*_PIP_INSTALL, "num2words"], check=True)

# Reinstall torchaudio to match the torch CUDA version on this space.
# torch is queried in a child interpreter, so it is not imported into this
# process at this point.
_tv = subprocess.run(
    [sys.executable, "-c", "import torch; print(torch.__version__)"],
    text=True,
    capture_output=True,
)
if _tv.returncode == 0:
    _full_ver = _tv.stdout.strip()
    # "<base>+<local>": the local part selects the wheel index; "cu124" is the
    # fallback when the version string carries no "+" part.
    _base_ver = _full_ver.partition("+")[0]
    _cuda_suffix = _full_ver.rpartition("+")[2] if "+" in _full_ver else "cu124"
    print(f"Detected torch {_full_ver}, reinstalling matching torchaudio...")
    subprocess.run(
        [
            *_PIP_INSTALL,
            "--force-reinstall",
            "--no-deps",
            f"torchaudio=={_base_ver}",
            "--index-url",
            f"https://download.pytorch.org/whl/{_cuda_suffix}",
        ],
        check=False,
    )
# ─────────────────────────────────────────────────────────────────────────────
# ltx-core / ltx-pipelines source
#
# Per the original author's note: HDRICLoraPipeline and its supporting modules
# (ltx_core.hdr, ltx_pipelines.utils.blocks, load_video_conditioning_hdr,
# apply_hdr_decode_postprocess, save_exr_tensor, encode_exr_sequence_to_mp4)
# are NOT on the public main branch at the pinned commit used by the outpaint
# app, so both packages are installed from a local ltx-2-internal checkout.
# ─────────────────────────────────────────────────────────────────────────────

# Checkout location: $LTX_INTERNAL_PATH if set, otherwise the "ltx-2-internal"
# directory next to this file.
LTX_INTERNAL = Path(
    os.environ.get(
        "LTX_INTERNAL_PATH",
        str(Path(__file__).resolve().parent / "ltx-2-internal"),
    )
)
LTX_CORE_PKG = LTX_INTERNAL / "packages" / "ltx-core"
LTX_PIPELINES_PKG = LTX_INTERNAL / "packages" / "ltx-pipelines"

print(f"Installing ltx-core + ltx-pipelines from {LTX_INTERNAL}...")

# Editable install of both packages; check=True makes a failure fatal.
subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "--force-reinstall", "--no-deps",
        "-e", str(LTX_CORE_PKG),
        "-e", str(LTX_PIPELINES_PKG),
    ],
    check=True,
)

# Put both `src` trees at the front of sys.path. Inserting pipelines first and
# core second leaves core at index 0 and pipelines at index 1.
for _pkg_root in (LTX_PIPELINES_PKG, LTX_CORE_PKG):
    sys.path.insert(0, str(_pkg_root / "src"))
import logging
import random
import tempfile
import zipfile
import torch
torch._dynamo.config.suppress_errors = True
torch._dynamo.config.disable = True
import spaces
import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download, snapshot_download
from ltx_core.model.video_vae import TilingConfig
from ltx_core.quantization import QuantizationPolicy
from ltx_pipelines.hdr_ic_lora import HDRICLoraPipeline, _make_tiling_config
from ltx_pipelines.utils.blocks import PromptEncoder
from ltx_pipelines.utils.media_io import (
encode_exr_sequence_to_mp4,
get_videostream_metadata,
save_exr_tensor,
)
from ltx_pipelines.utils.types import OffloadMode
# xformers attention patch (same as the outpaint app): rebind
# ltx_core's `memory_efficient_attention` to the xformers implementation.
from ltx_core.model.transformer import attention as _attn_mod

print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
try:
    from xformers.ops import memory_efficient_attention as _mea

    setattr(_attn_mod, "memory_efficient_attention", _mea)
    print(f"[ATTN] After patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
except Exception as exc:
    # Best-effort: a missing or broken xformers is reported and the module is
    # left with whatever attention callable it already had.
    print(f"[ATTN] xformers patch FAILED: {type(exc).__name__}: {exc}")

# Root logger at INFO.
logging.root.setLevel(logging.INFO)
# ─────────────────────────────────────────────────────────────────────────────
# Constants + model downloads
# ─────────────────────────────────────────────────────────────────────────────

# Upper bound for the seed slider and for randomly drawn seeds.
MAX_SEED = np.iinfo(np.int32).max

# Frames must satisfy (n-1) % 8 == 0. Aspect-ratio canvas sizes (divisible by 32).
# Each entry is unpacked by the caller as (width, height).
RESOLUTIONS = {
    "low": {
        "16:9": (768, 512),
        "9:16": (512, 768),
        "1:1": (768, 768),
        "4:3": (768, 576),
        "3:4": (576, 768),
        "21:9": (768, 384),
    },
    "high": {
        "16:9": (1536, 1024),
        "9:16": (1024, 1536),
        "1:1": (1024, 1024),
        "4:3": (1536, 1152),
        "3:4": (1152, 1536),
        "21:9": (1536, 768),
    },
}

# Hugging Face Hub repositories and file names.
LTX_MODEL_REPO = "Lightricks/LTX-2.3"
DISTILLED_CHECKPOINT = "ltx-2.3-22b-distilled-1.1.safetensors"
SPATIAL_UPSCALER = "ltx-2.3-spatial-upscaler-x2-1.1.safetensors"
GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
HDR_LORA_REPO = "Lightricks/LTX-2.3-22b-IC-LoRA-HDR"
HDR_LORA_FILENAME = "ltx-2.3-22b-ic-lora-hdr-0.9.safetensors"

_BANNER = "=" * 80
print(_BANNER)
print("Downloading LTX-2.3 distilled + spatial upsampler + Gemma + HDR IC-LoRA...")
print(_BANNER)

# Fetch the three single-file artifacts, then the whole Gemma repo snapshot.
checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename=DISTILLED_CHECKPOINT)
spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename=SPATIAL_UPSCALER)
hdr_lora_path = hf_hub_download(repo_id=HDR_LORA_REPO, filename=HDR_LORA_FILENAME)
gemma_root = snapshot_download(repo_id=GEMMA_REPO)

# Log where each artifact ended up on disk.
for _label, _location in (
    ("Checkpoint", checkpoint_path),
    ("Spatial upsampler", spatial_upsampler_path),
    ("HDR IC-LoRA", hdr_lora_path),
    ("Gemma root", gemma_root),
):
    print(f"{_label}: {_location}")
# ─────────────────────────────────────────────────────────────────────────────
# Text encoding: prompts are encoded on the fly into
# (video_context, audio_context). HDRICLoraPipeline expects a `.pt` path at
# __init__, so one is bootstrapped below and `pipeline.text_embeddings` is
# overwritten in memory on each generate call.
# ─────────────────────────────────────────────────────────────────────────────

# CUDA when available, CPU otherwise; bfloat16 throughout.
_DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
_DTYPE = torch.bfloat16

prompt_encoder = PromptEncoder(
    gemma_root=gemma_root,
    checkpoint_path=checkpoint_path,
    device=_DEVICE,
    dtype=_DTYPE,
)
def encode_prompt_to_contexts(prompt: str) -> tuple[torch.Tensor, torch.Tensor]:
    """Encode one prompt into a ``(video_context, audio_context)`` pair.

    The prompt is passed as a one-element batch to the module-level
    ``prompt_encoder``, and the single result is unpacked. When the encoder
    yields no audio encoding, an empty tensor on the video encoding's device
    and dtype is returned in its place.

    Per the original author's note: HDRICLoraPipeline only consumes
    ``video_context``, and this function MUST be called from inside a
    ``@spaces.GPU`` context on ZeroGPU.

    Args:
        prompt: Text prompt to encode.

    Returns:
        ``(video_context, audio_context)``.
    """
    (encoded,) = prompt_encoder([prompt])
    video_context = encoded.video_encoding
    audio_context = encoded.audio_encoding
    if audio_context is None:
        # Stand-in so callers always receive a tensor for the audio slot.
        audio_context = torch.zeros(
            0, device=video_context.device, dtype=video_context.dtype
        )
    return video_context, audio_context
# Per the original author's note: HDRICLoraPipeline.__init__ requires a .pt it
# can torch.load, but it only stores the tensors — __call__ reads
# `self.text_embeddings`, which is overwritten on every generate run. So a
# placeholder .pt is written at module load (CPU, no Gemma run — Gemma can only
# touch the GPU inside a @spaces.GPU function on ZeroGPU).
_bootstrap_emb_path = Path(tempfile.gettempdir(), "ltx_hdr_bootstrap_emb.pt")
_placeholder = torch.zeros((1, 1, 4096), dtype=_DTYPE)
# Both keys share the same zero tensor.
torch.save(
    dict.fromkeys(("video_context", "audio_context"), _placeholder),
    _bootstrap_emb_path,
)
# ─────────────────────────────────────────────────────────────────────────────
# Initialize pipeline
# ─────────────────────────────────────────────────────────────────────────────
# Per the original author's note: HDRICLoraPipeline is video-only (no audio
# path); the HDR transform (LogC3) and reference_downscale_factor are
# auto-detected from the LoRA metadata.
pipeline = HDRICLoraPipeline(
    distilled_checkpoint_path=checkpoint_path,
    spatial_upsampler_path=spatial_upsampler_path,
    hdr_lora=hdr_lora_path,
    # Placeholder embeddings written at module load.
    text_embeddings_path=str(_bootstrap_emb_path),
    quantization=QuantizationPolicy.fp8_cast(),
    offload_mode=OffloadMode.NONE,
)

print(
    f"HDRICLoraPipeline ready. HDR transform: {pipeline.hdr_transform}, "
    f"ref_downscale={pipeline.reference_downscale_factor}"
)
print("=" * 80)
# ─────────────────────────────────────────────────────────────────────────────
# UI helpers
# ─────────────────────────────────────────────────────────────────────────────
def detect_aspect_ratio(video_path) -> str:
    """Return the preset aspect-ratio label closest to the video's own ratio.

    Args:
        video_path: Path of the video to probe, or ``None``.

    Returns:
        One of ``"16:9"``, ``"9:16"``, ``"1:1"``, ``"4:3"``, ``"3:4"``,
        ``"21:9"``. ``"16:9"`` is returned when ``video_path`` is ``None`` or
        when reading the metadata / computing the ratio raises.
    """
    fallback = "16:9"
    if video_path is None:
        return fallback

    try:
        info = get_videostream_metadata(str(video_path))
        measured = info.width / info.height
    except Exception:
        # Unreadable file, zero height, etc.: fall back to the default label.
        return fallback

    presets = (
        ("16:9", 16 / 9),
        ("9:16", 9 / 16),
        ("1:1", 1.0),
        ("4:3", 4 / 3),
        ("3:4", 3 / 4),
        ("21:9", 21 / 9),
    )
    # Nearest preset by absolute difference; ties resolve to the earliest entry.
    best_label, _ = min(presets, key=lambda preset: abs(measured - preset[1]))
    return best_label
def get_target_resolution(target_aspect: str, high_res: bool) -> tuple[int, int]:
    """Look up the canvas size for an aspect-ratio label.

    Args:
        target_aspect: Aspect-ratio label such as ``"16:9"``.
        high_res: Select the ``"high"`` tier of ``RESOLUTIONS`` when true,
            the ``"low"`` tier otherwise.

    Returns:
        The tier's entry for ``target_aspect``; the tier's ``"16:9"`` entry
        when the label is not present.
    """
    sizes = RESOLUTIONS["high"] if high_res else RESOLUTIONS["low"]
    if target_aspect in sizes:
        return sizes[target_aspect]
    return sizes["16:9"]
def on_video_upload(video):
    """Derive UI defaults from a newly selected source video.

    Args:
        video: Path of the uploaded video, or ``None`` when it was cleared.

    Returns:
        Three ``gr.update`` objects, in order: duration, aspect ratio, FPS.
        With no video all three are no-op updates. When the metadata cannot
        be read, duration and FPS fall back to 3.0 s and 24.0.
    """
    if video is None:
        return gr.update(), gr.update(), gr.update()

    try:
        info = get_videostream_metadata(str(video))
        clip_seconds = info.frames / info.fps
        # Cap at 6.7 s and keep one decimal.
        # NOTE(review): clips shorter than 1 s yield a value below the
        # duration slider's minimum of 1.0 - confirm how the UI treats that.
        duration_value = round(min(clip_seconds, 6.7), 1)
        fps_value = float(info.fps)
    except Exception:
        duration_value, fps_value = 3.0, 24.0

    duration_update = gr.update(value=duration_value)
    aspect_update = gr.update(value=detect_aspect_ratio(video))
    fps_update = gr.update(value=fps_value)
    return duration_update, aspect_update, fps_update
# ─────────────────────────────────────────────────────────────────────────────
# Generation
# ─────────────────────────────────────────────────────────────────────────────
def _aligned_frame_count(duration: float, frame_rate: float) -> int:
    """Return the frame count for a clip, rounded up so that (n - 1) % 8 == 0.

    The requested count is ``int(duration * frame_rate) + 1``; the result is
    the smallest ``n >= requested`` of the form ``8 * k + 1``.
    """
    requested = int(duration * frame_rate) + 1
    return ((requested - 1 + 7) // 8) * 8 + 1


def _crop_reflection_padding(hdr_video, src_w, src_h, target_w, target_h):
    """Slice off the bottom/right band that does not hold source content.

    Treats dim 1 of ``hdr_video`` as height and dim 2 as width (the original
    author's note describes the tensor as [f, h, w, c]). The content extent is
    computed from the source aspect ratio on the target canvas, rounded down
    to an even size, and applied as a top-left slice. The tensor is returned
    unchanged when the aspects match or the computed extent is not strictly
    inside the current size.
    """
    src_aspect = src_w / src_h
    tgt_aspect = target_w / target_h
    if src_aspect > tgt_aspect:
        # Source is wider than the canvas: content occupies the top rows.
        content_h = int(round(target_w / src_aspect))
        content_h -= content_h % 2  # libx264 yuv420p needs even dims
        if 0 < content_h < hdr_video.shape[1]:
            print(f"[HDR] Cropping reflected bottom: {hdr_video.shape[1]} -> {content_h}")
            return hdr_video[:, :content_h, :, :]
    elif src_aspect < tgt_aspect:
        # Source is taller than the canvas: content occupies the left columns.
        content_w = int(round(target_h * src_aspect))
        content_w -= content_w % 2  # libx264 yuv420p needs even dims
        if 0 < content_w < hdr_video.shape[2]:
            print(f"[HDR] Cropping reflected right: {hdr_video.shape[2]} -> {content_w}")
            return hdr_video[:, :, :content_w, :]
    return hdr_video


@spaces.GPU(duration=300)
@torch.inference_mode()
def generate_video(
    input_video,
    prompt: str,
    duration: float,
    frame_rate: float,
    target_aspect: str,
    high_res: bool,
    seed: int,
    randomize_seed: bool,
    high_quality_hdr: bool,
    export_exr: bool,
    progress=gr.Progress(track_tqdm=True),
):
    """Run HDR video-to-video generation and write the preview / EXR outputs.

    Args:
        input_video: Path of the source video (``None`` if nothing uploaded).
        prompt: Text prompt; ``None`` or empty is encoded as ``""``.
        duration: Requested clip length in seconds.
        frame_rate: Output frames per second; must be a positive number.
        target_aspect: Aspect-ratio label used to pick the canvas size.
        high_res: Use the high-resolution canvas tier and the larger tile.
        seed: Seed to use when ``randomize_seed`` is false.
        randomize_seed: Draw a random seed in ``[0, MAX_SEED]`` instead.
        high_quality_hdr: Forwarded to the pipeline unchanged.
        export_exr: Also bundle the per-frame EXR files into a zip.
        progress: Gradio progress tracker (declared so Gradio injects it; not
            referenced in the body).

    Returns:
        ``(preview_mp4_path, exr_zip_path_or_None, seed_used)`` on success.
        On any failure ``(None, None, seed_used)`` is returned; the error is
        printed with its traceback and also shown to the user as a Gradio
        warning.
    """
    import shutil
    import traceback

    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    work_dir = None  # set once the temp directory exists, for failure cleanup
    try:
        # Validate inputs before touching the GPU.
        if input_video is None:
            raise ValueError("Please upload a source video.")
        if not frame_rate or frame_rate <= 0:
            raise ValueError("FPS must be a positive number.")

        torch.cuda.reset_peak_memory_stats()
        video_path = str(input_video)

        # Frame count must satisfy (n-1) % 8 == 0
        num_frames = _aligned_frame_count(duration, frame_rate)
        target_w, target_h = get_target_resolution(target_aspect, high_res)
        print(f"[HDR] {target_h}x{target_w}, frames={num_frames}, fps={frame_rate}, "
              f"seed={current_seed}, aspect={target_aspect}, hq_hdr={high_quality_hdr}")

        # Encode prompt -> (video_context, audio_context) and swap it into the
        # pipeline in place of the placeholder embeddings.
        print(f"[HDR] Encoding prompt: {prompt!r}")
        video_context, audio_context = encode_prompt_to_contexts(prompt or "")
        pipeline.text_embeddings = (video_context, audio_context)

        # Tiling config: smaller spatial tile for the low-resolution tier.
        tiling_config = _make_tiling_config(spatial_tile=768 if not high_res else 1280)

        hdr_video = pipeline(
            seed=current_seed,
            height=int(target_h),
            width=int(target_w),
            num_frames=num_frames,
            frame_rate=float(frame_rate),
            video_conditioning=[(video_path, 1.0)],
            tiling_config=tiling_config,
            high_quality_hdr=high_quality_hdr,
        )

        # Per the original author's note: the pipeline's internal
        # resize_and_reflect_pad adds bottom/right reflection when source
        # aspect != target aspect, and its built-in crop only undoes the
        # 64-divisor alignment padding — it leaves aspect-mismatch reflection
        # bands in the decoded output. Apply the same top-left slice here with
        # the un-reflected content region. Best-effort: any failure keeps the
        # uncropped frames.
        try:
            src_meta = get_videostream_metadata(video_path)
            hdr_video = _crop_reflection_padding(
                hdr_video, src_meta.width, src_meta.height, target_w, target_h
            )
        except Exception as e:
            print(f"[HDR] Post-crop skipped: {type(e).__name__}: {e}")

        # Write one EXR per frame (half=False), then encode the sequence to an
        # mp4 for in-browser preview.
        work_dir = Path(tempfile.mkdtemp(prefix="ltx-hdr-"))
        exr_dir = work_dir / "exr"
        exr_dir.mkdir(parents=True, exist_ok=True)
        for j in range(hdr_video.shape[0]):
            save_exr_tensor(
                hdr_video[j].detach().cpu(),
                str(exr_dir / f"frame_{j:05d}.exr"),
                half=False,
            )
        del hdr_video  # drop the reference before encoding

        preview_mp4 = work_dir / "preview.mp4"
        encode_exr_sequence_to_mp4(exr_dir, preview_mp4, frame_rate=float(frame_rate))

        exr_zip_path = None
        if export_exr:
            exr_zip_path = work_dir / "exr.zip"
            # ZIP_STORED: frames are added without compression.
            with zipfile.ZipFile(exr_zip_path, "w", compression=zipfile.ZIP_STORED) as zf:
                for exr in sorted(exr_dir.glob("frame_*.exr")):
                    zf.write(exr, arcname=exr.name)

        # The loose EXR frames are no longer referenced by anything returned
        # (the mp4 is encoded, the zip holds its own copies), so free the disk.
        shutil.rmtree(exr_dir, ignore_errors=True)

        return str(preview_mp4), (str(exr_zip_path) if exr_zip_path else None), current_seed
    except Exception as e:
        print(f"Error: {e}\n{traceback.format_exc()}")
        # Do not leave partial outputs of a failed run behind.
        if work_dir is not None:
            shutil.rmtree(work_dir, ignore_errors=True)
        # Tell the user why the outputs are empty instead of failing silently.
        gr.Warning(f"HDR generation failed: {e}")
        return None, None, current_seed
# ─────────────────────────────────────────────────────────────────────────────
# Gradio UI — LTX 2.3 HDR
# ─────────────────────────────────────────────────────────────────────────────
# Custom CSS passed to gr.Blocks: generate-button sizing, hidden footer,
# object-fit: contain on video elements.
css = (
    "\n"
    ".generate-btn { min-height: 52px !important; font-size: 1.1em !important; }\n"
    "footer { display: none !important; }\n"
    "video { object-fit: contain !important; }\n"
)

# Citrus theme with purple primary/secondary hues and a gray neutral hue.
_hues = gr.themes.colors
theme = gr.themes.Citrus(
    primary_hue=_hues.purple,
    secondary_hue=_hues.purple,
    neutral_hue=_hues.gray,
)
# Two-column layout: inputs and settings on the left, outputs on the right.
# NOTE(review): the nesting below is reconstructed from the widget order -
# confirm it against the intended layout.
with gr.Blocks(title="LTX 2.3 HDR", css=css, theme=theme) as demo:
    # The link targets the same Hub repo the HDR IC-LoRA is downloaded from.
    gr.Markdown(f"""
# LTX 2.3 HDR ✨
Reconstruct the highlights and shadows clipped away by the camera with LTX-2.3 HDR:
Video-to-video HDR via LTX-2.3 + [HDR IC-LoRA](https://huggingface.co/{HDR_LORA_REPO}).
Output is linear HDR (LogC3 inverse decoded).
""")
    with gr.Row():
        with gr.Column(scale=1):
            input_video = gr.Video(label="Source Video")
            prompt = gr.Textbox(
                label="Prompt",
                info="Describe the scene being regenerated in HDR",
                lines=2,
                placeholder="a cinematic sunset over mountains, high dynamic range, bright sky, deep shadows",
            )
            with gr.Row():
                target_aspect = gr.Dropdown(
                    label="Aspect Ratio",
                    choices=["16:9", "9:16", "1:1", "4:3", "3:4", "21:9"],
                    value="16:9",
                )
                duration = gr.Slider(
                    label="Duration (s)", minimum=1.0, maximum=6.7, value=3.0, step=0.1,
                )
                frame_rate = gr.Number(label="FPS", value=24.0, precision=2)
            generate_btn = gr.Button(
                "Generate HDR", variant="primary", size="lg", elem_classes=["generate-btn"],
            )
            with gr.Accordion("Advanced Settings", open=False):
                high_res = gr.Checkbox(label="High Resolution (2×)", value=False)
                high_quality_hdr = gr.Checkbox(
                    label="High-Quality HDR",
                    value=False,
                    info="Generates at 2× internal frame count, keeps every other frame. ~2× slower, smoother temporally.",
                )
                export_exr = gr.Checkbox(
                    label="Export EXR frames (zip)",
                    value=True,
                    info="Per-frame linear OpenEXR (float32). Uncheck if you only want the mp4 preview.",
                )
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, value=10, step=1)
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
        with gr.Column(scale=1):
            output_video = gr.Video(label="Preview (sRGB tonemap)", autoplay=True, height=480)
            gr.Markdown("The preview mp4 is a fixed-EV sRGB tonemap; the EXR zip contains the full linear float frames for grading.")
            output_exr = gr.File(label="EXR frames (zip)")

    # Selecting a source video pre-fills duration, aspect ratio and FPS.
    input_video.change(
        fn=on_video_upload,
        inputs=[input_video],
        outputs=[duration, target_aspect, frame_rate],
    )
    # The seed actually used is written back to the seed slider.
    generate_btn.click(
        fn=generate_video,
        inputs=[
            input_video, prompt, duration, frame_rate, target_aspect, high_res,
            seed, randomize_seed, high_quality_hdr, export_exr,
        ],
        outputs=[output_video, output_exr, seed],
    )
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()