import os
import subprocess
import sys
from pathlib import Path

# Disable torch.compile / dynamo before any torch import
os.environ["TORCH_COMPILE_DISABLE"] = "1"
os.environ["TORCHDYNAMO_DISABLE"] = "1"
# OpenImageIO / OpenEXR readers need this to be set before cv2 import
os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"

# Install xformers for memory-efficient attention.
# Best effort (check=False): a failed install does not abort startup.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"],
    check=False,
)

# Video / HDR dependencies (best effort as well).
subprocess.run(
    [sys.executable, "-m", "pip", "install",
     "imageio[ffmpeg]", "scikit-image", "opencv-python-headless", "decord",
     "num2words", "OpenImageIO"],
    check=False,
)
# num2words is installed a second time with check=True, so startup aborts if
# it is still missing after the best-effort batch above.
subprocess.run([sys.executable, "-m", "pip", "install", "num2words"], check=True)

# Reinstall torchaudio to match the torch CUDA version on this space.
# The version is probed in a child interpreter so torch is not imported into
# this process yet.
_tv = subprocess.run(
    [sys.executable, "-c", "import torch; print(torch.__version__)"],
    capture_output=True,
    text=True,
)
_full_ver = _tv.stdout.strip() if _tv.returncode == 0 else ""
if _full_ver:
    # A version string such as "<base>+<cuda tag>" carries the wheel index
    # suffix after "+"; fall back to "cu124" when there is no local tag.
    _cuda_suffix = _full_ver.split("+")[-1] if "+" in _full_ver else "cu124"
    _base_ver = _full_ver.split("+")[0]
    print(f"Detected torch {_full_ver}, reinstalling matching torchaudio...")
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps",
         f"torchaudio=={_base_ver}",
         "--index-url", f"https://download.pytorch.org/whl/{_cuda_suffix}"],
        check=False,
    )
else:
    # Previously skipped silently; say so to make a mismatched torchaudio
    # easier to diagnose from the startup log.
    print(
        f"Could not detect torch version (exit code {_tv.returncode}); "
        "skipping torchaudio reinstall."
    )

# ─────────────────────────────────────────────────────────────────────────────
# ltx-core / ltx-pipelines source
#
# The HDRICLoraPipeline and its supporting modules (ltx_core.hdr,
# ltx_pipelines.utils.blocks, load_video_conditioning_hdr, apply_hdr_decode_postprocess,
# save_exr_tensor, encode_exr_sequence_to_mp4) are NOT on the public main
# branch at the pinned commit used by the outpaint app. We install from the
# local ltx-2-internal checkout so the HDR code path actually exists.
# ─────────────────────────────────────────────────────────────────────────────
# Location of the internal checkout; overridable through LTX_INTERNAL_PATH,
# defaulting to an "ltx-2-internal" directory next to this file.
LTX_INTERNAL = Path(os.environ.get(
    "LTX_INTERNAL_PATH",
    str(Path(__file__).resolve().parent / "ltx-2-internal"),
))
LTX_CORE_PKG = LTX_INTERNAL / "packages" / "ltx-core"
LTX_PIPELINES_PKG = LTX_INTERNAL / "packages" / "ltx-pipelines"

# Fail early with a readable message instead of an opaque pip error when the
# checkout is missing or incomplete.
for _pkg_dir in (LTX_CORE_PKG, LTX_PIPELINES_PKG):
    if not _pkg_dir.is_dir():
        raise FileNotFoundError(
            f"Expected package directory not found: {_pkg_dir} "
            "(set LTX_INTERNAL_PATH to the ltx-2-internal checkout)"
        )

print(f"Installing ltx-core + ltx-pipelines from {LTX_INTERNAL}...")
subprocess.run(
    [sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps",
     "-e", str(LTX_CORE_PKG),
     "-e", str(LTX_PIPELINES_PKG)],
    check=True,
)
# Put both source trees at the front of sys.path so the imports below resolve
# to the checkout that was just installed.
sys.path.insert(0, str(LTX_PIPELINES_PKG / "src"))
sys.path.insert(0, str(LTX_CORE_PKG / "src"))

import logging
import random
import tempfile
import zipfile

import torch

torch._dynamo.config.suppress_errors = True
torch._dynamo.config.disable = True

import spaces
import gradio as gr
import numpy as np
from huggingface_hub import hf_hub_download, snapshot_download

from ltx_core.model.video_vae import TilingConfig
from ltx_core.quantization import QuantizationPolicy
from ltx_pipelines.hdr_ic_lora import HDRICLoraPipeline, _make_tiling_config
from ltx_pipelines.utils.blocks import PromptEncoder
from ltx_pipelines.utils.media_io import (
    encode_exr_sequence_to_mp4,
    get_videostream_metadata,
    save_exr_tensor,
)
from ltx_pipelines.utils.types import OffloadMode

# xformers attention patch (same as the outpaint app).
from ltx_core.model.transformer import attention as _attn_mod

print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
try:
    from xformers.ops import memory_efficient_attention as _mea

    # Replace the module-level attribute with the xformers implementation.
    _attn_mod.memory_efficient_attention = _mea
    print(f"[ATTN] After patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
except Exception as e:
    # Best effort: the app keeps whatever implementation was there before.
    print(f"[ATTN] xformers patch FAILED: {type(e).__name__}: {e}")

logging.getLogger().setLevel(logging.INFO)

# ─────────────────────────────────────────────────────────────────────────────
# Constants + model downloads
# ─────────────────────────────────────────────────────────────────────────────
MAX_SEED = np.iinfo(np.int32).max

# Frames must satisfy (n-1) % 8 == 0. Aspect-ratio canvas sizes (divisible by 32).
RESOLUTIONS = {
    "low": {
        "16:9": (768, 512),
        "9:16": (512, 768),
        "1:1": (768, 768),
        "4:3": (768, 576),
        "3:4": (576, 768),
        "21:9": (768, 384),
    },
    "high": {
        "16:9": (1536, 1024),
        "9:16": (1024, 1536),
        "1:1": (1024, 1024),
        "4:3": (1536, 1152),
        "3:4": (1152, 1536),
        "21:9": (1536, 768),
    },
}

LTX_MODEL_REPO = "Lightricks/LTX-2.3"
DISTILLED_CHECKPOINT = "ltx-2.3-22b-distilled-1.1.safetensors"
SPATIAL_UPSCALER = "ltx-2.3-spatial-upscaler-x2-1.1.safetensors"
GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
HDR_LORA_REPO = "Lightricks/LTX-2.3-22b-IC-LoRA-HDR"
HDR_LORA_FILENAME = "ltx-2.3-22b-ic-lora-hdr-0.9.safetensors"

print("=" * 80)
print("Downloading LTX-2.3 distilled + spatial upsampler + Gemma + HDR IC-LoRA...")
print("=" * 80)

checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename=DISTILLED_CHECKPOINT)
spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename=SPATIAL_UPSCALER)
hdr_lora_path = hf_hub_download(repo_id=HDR_LORA_REPO, filename=HDR_LORA_FILENAME)
gemma_root = snapshot_download(repo_id=GEMMA_REPO)

print(f"Checkpoint: {checkpoint_path}")
print(f"Spatial upsampler: {spatial_upsampler_path}")
print(f"HDR IC-LoRA: {hdr_lora_path}")
print(f"Gemma root: {gemma_root}")

# ─────────────────────────────────────────────────────────────────────────────
# Text encoding: on-the-fly Gemma -> (video_context, audio_context) for each
# prompt. HDRICLoraPipeline expects a `.pt` path at __init__, so we bootstrap
# one from an empty prompt, then overwrite `pipeline.text_embeddings` in
# memory each generate call.
# ─────────────────────────────────────────────────────────────────────────────
_DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_DTYPE = torch.bfloat16

prompt_encoder = PromptEncoder(
    checkpoint_path=checkpoint_path,
    gemma_root=gemma_root,
    dtype=_DTYPE,
    device=_DEVICE,
)


def encode_prompt_to_contexts(prompt: str) -> tuple[torch.Tensor, torch.Tensor]:
    """Run Gemma + embeddings processor to produce (video_context, audio_context).

    HDRICLoraPipeline only consumes video_context; audio_context is stored for
    shape-compat with the `.pt` interface but ignored during HDR generation.

    MUST be called from inside a @spaces.GPU context on ZeroGPU.

    Args:
        prompt: Text prompt; it is encoded as a single-element batch.

    Returns:
        A ``(video_context, audio_context)`` tuple. When the encoder yields no
        audio encoding, ``audio_context`` is an empty tensor with the same
        device and dtype as ``video_context``.
    """
    # The encoder is called with a one-element list; unpack its single result.
    (out,) = prompt_encoder([prompt])
    v = out.video_encoding
    if out.audio_encoding is not None:
        a = out.audio_encoding
    else:
        a = torch.zeros(0, device=v.device, dtype=v.dtype)
    return v, a


# HDRICLoraPipeline.__init__ requires a .pt it can torch.load, but it only
# stores the tensors — __call__ reads `self.text_embeddings` which we overwrite
# on every generate run. So write a placeholder .pt at module-load (CPU, no
# Gemma run — Gemma can only touch GPU inside a @spaces.GPU function on ZeroGPU).
_bootstrap_emb_path = Path(tempfile.gettempdir()) / "ltx_hdr_bootstrap_emb.pt"
_placeholder = torch.zeros(1, 1, 4096, dtype=_DTYPE)
torch.save({"video_context": _placeholder, "audio_context": _placeholder}, _bootstrap_emb_path)

# ─────────────────────────────────────────────────────────────────────────────
# Initialize pipeline
# ─────────────────────────────────────────────────────────────────────────────
# HDRICLoraPipeline is video-only (no audio path).
# HDR transform (LogC3) and
# reference_downscale_factor are auto-detected from the LoRA metadata.
pipeline = HDRICLoraPipeline(
    distilled_checkpoint_path=checkpoint_path,
    spatial_upsampler_path=spatial_upsampler_path,
    hdr_lora=hdr_lora_path,
    text_embeddings_path=str(_bootstrap_emb_path),
    quantization=QuantizationPolicy.fp8_cast(),
    offload_mode=OffloadMode.NONE,
)
print(f"HDRICLoraPipeline ready. HDR transform: {pipeline.hdr_transform}, "
      f"ref_downscale={pipeline.reference_downscale_factor}")
print("=" * 80)


# ─────────────────────────────────────────────────────────────────────────────
# UI helpers
# ─────────────────────────────────────────────────────────────────────────────
# Bounds of the "Duration (s)" slider. Shared with on_video_upload so the
# value it pushes into the slider can never fall outside the slider's range.
MIN_DURATION_S = 1.0
MAX_DURATION_S = 6.7

# Supported aspect-ratio labels -> numeric width / height ratio.
_ASPECT_RATIOS = {
    "16:9": 16 / 9,
    "9:16": 9 / 16,
    "1:1": 1.0,
    "4:3": 4 / 3,
    "3:4": 3 / 4,
    "21:9": 21 / 9,
}


def detect_aspect_ratio(video_path) -> str:
    """Return the supported aspect-ratio label closest to the video's own.

    Args:
        video_path: Path to the video, or ``None``.

    Returns:
        One of the keys of ``_ASPECT_RATIOS``. ``"16:9"`` is returned when
        ``video_path`` is ``None`` or its metadata cannot be read.
    """
    if video_path is None:
        return "16:9"
    try:
        meta = get_videostream_metadata(str(video_path))
        ratio = meta.width / meta.height
    except Exception:
        # Unreadable metadata (or zero height): fall back to the default.
        return "16:9"
    return min(_ASPECT_RATIOS, key=lambda label: abs(ratio - _ASPECT_RATIOS[label]))


def get_target_resolution(target_aspect: str, high_res: bool) -> tuple[int, int]:
    """Look up the canvas size for an aspect-ratio label.

    Args:
        target_aspect: Aspect-ratio label such as ``"16:9"``.
        high_res: Select the ``"high"`` tier of ``RESOLUTIONS`` instead of ``"low"``.

    Returns:
        The tuple stored in ``RESOLUTIONS`` for that tier and label (callers
        unpack it as width, height); unknown labels fall back to ``"16:9"``.
    """
    tier = "high" if high_res else "low"
    return RESOLUTIONS[tier].get(target_aspect, RESOLUTIONS[tier]["16:9"])


def on_video_upload(video):
    """Pre-fill duration, aspect ratio and FPS from an uploaded video.

    Args:
        video: Path of the uploaded video, or ``None`` when it was cleared.

    Returns:
        Three ``gr.update`` objects for the duration slider, the aspect-ratio
        dropdown and the FPS field. They are no-op updates when ``video`` is
        ``None``.
    """
    if video is None:
        return gr.update(), gr.update(), gr.update()
    try:
        meta = get_videostream_metadata(str(video))
        clip_seconds = meta.frames / meta.fps
        # Clamp on both sides: a clip shorter than the slider minimum would
        # otherwise push an out-of-range value into the slider.
        dur = round(min(max(clip_seconds, MIN_DURATION_S), MAX_DURATION_S), 1)
        fps = float(meta.fps)
    except Exception:
        # Unreadable metadata (or zero fps): fall back to the UI defaults.
        dur, fps = 3.0, 24.0
    return gr.update(value=dur), gr.update(value=detect_aspect_ratio(video)), gr.update(value=fps)


# ─────────────────────────────────────────────────────────────────────────────
# Generation
# ─────────────────────────────────────────────────────────────────────────────
def _snap_frame_count(duration: float, frame_rate: float) -> int:
    """Return the frame count for the clip, rounded up so (n-1) % 8 == 0."""
    intervals = int(duration * frame_rate)
    return ((intervals + 7) // 8) * 8 + 1


def _crop_reflected_padding(hdr_video, video_path: str, target_w: int, target_h: int):
    """Slice off the reflection band left when source and target aspects differ.

    The pipeline's internal resize_and_reflect_pad adds bottom/right
    reflection when source aspect != target aspect. Its built-in crop
    (_decode_video's `out[:, :crop_size[1], :crop_size[0], :]`) only
    undoes the 64-divisor alignment padding — it leaves aspect-mismatch
    reflection bands in the decoded output. Apply the same top-left
    slice here with the un-reflected content region.

    Best effort: on any failure the input is returned unchanged.
    """
    try:
        src_meta = get_videostream_metadata(video_path)
        src_aspect = src_meta.width / src_meta.height
        tgt_aspect = target_w / target_h
        if src_aspect > tgt_aspect:
            content_h = int(round(target_w / src_aspect))
            content_h -= content_h % 2  # libx264 yuv420p needs even dims
            if 0 < content_h < hdr_video.shape[1]:
                print(f"[HDR] Cropping reflected bottom: {hdr_video.shape[1]} -> {content_h}")
                hdr_video = hdr_video[:, :content_h, :, :]
        elif src_aspect < tgt_aspect:
            content_w = int(round(target_h * src_aspect))
            content_w -= content_w % 2  # libx264 yuv420p needs even dims
            if 0 < content_w < hdr_video.shape[2]:
                print(f"[HDR] Cropping reflected right: {hdr_video.shape[2]} -> {content_w}")
                hdr_video = hdr_video[:, :, :content_w, :]
    except Exception as e:
        print(f"[HDR] Post-crop skipped: {type(e).__name__}: {e}")
    return hdr_video


@spaces.GPU(duration=300)
@torch.inference_mode()
def generate_video(
    input_video,
    prompt: str,
    duration: float,
    frame_rate: float,
    target_aspect: str,
    high_res: bool,
    seed: int,
    randomize_seed: bool,
    high_quality_hdr: bool,
    export_exr: bool,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the HDR pipeline on the uploaded video and write preview + EXR outputs.

    Args:
        input_video: Path of the source video; ``None`` is reported as an error.
        prompt: Text prompt; ``None``/empty is encoded as ``""``.
        duration: Requested clip length in seconds.
        frame_rate: Output frame rate.
        target_aspect: Aspect-ratio label used to pick the canvas size.
        high_res: Use the high-resolution tier and the larger spatial tile.
        seed: Seed used when ``randomize_seed`` is false.
        randomize_seed: Draw a random seed in ``[0, MAX_SEED]`` instead.
        high_quality_hdr: Forwarded to the pipeline as ``high_quality_hdr``.
        export_exr: Also produce a zip of the per-frame EXR files.
        progress: Gradio progress tracker (tqdm-tracking).

    Returns:
        ``(preview_mp4_path, exr_zip_path_or_None, seed_used)``. On any failure
        the error is printed and ``(None, None, seed_used)`` is returned.
    """
    import shutil  # local import: only needed for scratch-directory cleanup

    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    work_dir = None
    try:
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()

        if input_video is None:
            raise ValueError("Please upload a source video.")

        video_path = str(input_video)

        # Frame count must satisfy (n-1) % 8 == 0
        num_frames = _snap_frame_count(duration, frame_rate)

        target_w, target_h = get_target_resolution(target_aspect, high_res)

        print(f"[HDR] {target_h}x{target_w}, frames={num_frames}, fps={frame_rate}, "
              f"seed={current_seed}, aspect={target_aspect}, hq_hdr={high_quality_hdr}")

        # Encode prompt -> (video_context, audio_context) and swap into the
        # pipeline. Gemma is loaded, used, and freed inside prompt_encoder.
        print(f"[HDR] Encoding prompt: {prompt!r}")
        video_context, audio_context = encode_prompt_to_contexts(prompt or "")
        pipeline.text_embeddings = (video_context, audio_context)

        # Tiling config: smaller spatial tile on lower-VRAM targets
        tiling_config = _make_tiling_config(spatial_tile=768 if not high_res else 1280)

        hdr_video = pipeline(
            seed=current_seed,
            height=int(target_h),
            width=int(target_w),
            num_frames=num_frames,
            frame_rate=float(frame_rate),
            video_conditioning=[(video_path, 1.0)],
            tiling_config=tiling_config,
            high_quality_hdr=high_quality_hdr,
        )

        hdr_video = _crop_reflected_padding(hdr_video, video_path, target_w, target_h)

        # hdr_video is [f, h, w, c] linear HDR float. Write EXR frames, then
        # tonemap to a libx264 mp4 for in-browser preview.
        work_dir = Path(tempfile.mkdtemp(prefix="ltx-hdr-"))
        exr_dir = work_dir / "exr"
        exr_dir.mkdir(parents=True, exist_ok=True)
        for j in range(hdr_video.shape[0]):
            save_exr_tensor(
                hdr_video[j].detach().cpu(),
                str(exr_dir / f"frame_{j:05d}.exr"),
                half=False,
            )
        del hdr_video  # release the decoded frames before encoding the preview

        preview_mp4 = work_dir / "preview.mp4"
        encode_exr_sequence_to_mp4(exr_dir, preview_mp4, frame_rate=float(frame_rate))

        exr_zip_path = None
        if export_exr:
            exr_zip_path = work_dir / "exr.zip"
            # ZIP_STORED: the frames are archived without compression.
            with zipfile.ZipFile(exr_zip_path, "w", compression=zipfile.ZIP_STORED) as zf:
                for exr in sorted(exr_dir.glob("frame_*.exr")):
                    zf.write(exr, arcname=exr.name)

        # The loose EXR frames are no longer needed once the preview (and the
        # optional zip) exist; removing them stops every run from leaving a
        # full float32 frame sequence behind on disk.
        shutil.rmtree(exr_dir, ignore_errors=True)

        return str(preview_mp4), (str(exr_zip_path) if exr_zip_path else None), current_seed

    except Exception as e:
        import traceback

        print(f"Error: {e}\n{traceback.format_exc()}")
        # Nothing in the scratch directory is returned on failure, so drop it.
        if work_dir is not None:
            shutil.rmtree(work_dir, ignore_errors=True)
        return None, None, current_seed


# ─────────────────────────────────────────────────────────────────────────────
# Gradio UI — LTX 2.3 HDR
# ─────────────────────────────────────────────────────────────────────────────
css = """
.generate-btn { min-height: 52px !important; font-size: 1.1em !important; }
footer { display: none !important; }
video { object-fit: contain !important; }
"""

theme = gr.themes.Citrus(
    primary_hue=gr.themes.colors.purple,
    secondary_hue=gr.themes.colors.purple,
    neutral_hue=gr.themes.colors.gray,
)

# NOTE(review): the nesting of the layout containers below was reconstructed
# from a whitespace-collapsed source — confirm it matches the intended layout.
with gr.Blocks(title="LTX 2.3 HDR", css=css, theme=theme) as demo:
    gr.Markdown("""
    # LTX 2.3 HDR ✨
    Reconstruct the highlights and shadows clipped away by the camera with LTX-2.3 HDR:
    Video-to-video HDR via LTX-2.3 + [HDR IC-LoRA](https://huggingface.co/diffusers-internal-dev/LTX-HDR-LoRA). Output is linear HDR (LogC3 inverse decoded).
    """)

    with gr.Row():
        with gr.Column(scale=1):
            input_video = gr.Video(label="Source Video")
            prompt = gr.Textbox(
                label="Prompt",
                info="Describe the scene being regenerated in HDR",
                lines=2,
                placeholder="a cinematic sunset over mountains, high dynamic range, bright sky, deep shadows",
            )
            with gr.Row():
                target_aspect = gr.Dropdown(
                    label="Aspect Ratio",
                    choices=["16:9", "9:16", "1:1", "4:3", "3:4", "21:9"],
                    value="16:9",
                )
                duration = gr.Slider(
                    label="Duration (s)",
                    minimum=MIN_DURATION_S,
                    maximum=MAX_DURATION_S,
                    value=3.0,
                    step=0.1,
                )
                frame_rate = gr.Number(label="FPS", value=24.0, precision=2)

            generate_btn = gr.Button(
                "Generate HDR",
                variant="primary",
                size="lg",
                elem_classes=["generate-btn"],
            )

            with gr.Accordion("Advanced Settings", open=False):
                high_res = gr.Checkbox(label="High Resolution (2×)", value=False)
                high_quality_hdr = gr.Checkbox(
                    label="High-Quality HDR",
                    value=False,
                    info="Generates at 2× internal frame count, keeps every other frame. ~2× slower, smoother temporally.",
                )
                export_exr = gr.Checkbox(
                    label="Export EXR frames (zip)",
                    value=True,
                    info="Per-frame linear OpenEXR (float32). Uncheck if you only want the mp4 preview.",
                )
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, value=10, step=1)
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)

        with gr.Column(scale=1):
            output_video = gr.Video(label="Preview (sRGB tonemap)", autoplay=True, height=480)
            gr.Markdown("The preview mp4 is a fixed-EV sRGB tonemap; the EXR zip contains the full linear float frames for grading.")
            output_exr = gr.File(label="EXR frames (zip)")

    # Refresh duration / aspect / FPS whenever the source video changes.
    input_video.change(
        fn=on_video_upload,
        inputs=[input_video],
        outputs=[duration, target_aspect, frame_rate],
    )

    # The seed slider is also an output so the seed actually used is shown.
    generate_btn.click(
        fn=generate_video,
        inputs=[
            input_video, prompt, duration, frame_rate, target_aspect,
            high_res, seed, randomize_seed, high_quality_hdr, export_exr,
        ],
        outputs=[output_video, output_exr, seed],
    )


if __name__ == "__main__":
    demo.launch()