LTX-2-3-hdr

Running on Zero

App Files Files Community

LTX-2-3-hdr / app.py

StatusReport

App: update HDR checkpoint path

e07c454 verified 25 days ago

raw

history blame contribute delete

19.7 kB

	import os
	import subprocess
	import sys
	from pathlib import Path

	# Environment switches that have to be in place before torch / cv2 are first
	# imported: turn off torch.compile + dynamo, and enable OpenCV's OpenEXR I/O.
	os.environ.update({
	    "TORCH_COMPILE_DISABLE": "1",
	    "TORCHDYNAMO_DISABLE": "1",
	    "OPENCV_IO_ENABLE_OPENEXR": "1",
	})

	# Common prefix for every pip invocation below (same interpreter as the app).
	_PIP_INSTALL = [sys.executable, "-m", "pip", "install"]

	# xformers supplies memory-efficient attention. Best effort: a failed install
	# does not stop startup (check=False).
	subprocess.run(
	    _PIP_INSTALL + ["xformers==0.0.32.post2", "--no-build-isolation"],
	    check=False,
	)

	# Video / HDR dependencies, also best effort.
	subprocess.run(
	    _PIP_INSTALL + [
	        "imageio[ffmpeg]",
	        "scikit-image",
	        "opencv-python-headless",
	        "decord",
	        "num2words",
	        "OpenImageIO",
	    ],
	    check=False,
	)
	# num2words is installed once more on its own, this time with check=True, so
	# startup aborts if it cannot be installed.
	subprocess.run(_PIP_INSTALL + ["num2words"], check=True)

	# Reinstall torchaudio so it matches the torch build present on this space.
	# The version is read in a child interpreter, keeping torch un-imported here.
	_tv = subprocess.run(
	    [sys.executable, "-c", "import torch; print(torch.__version__)"],
	    text=True,
	    capture_output=True,
	)
	if _tv.returncode == 0:
	    _full_ver = _tv.stdout.strip()
	    # "<base>+<local>" -> base version and wheel-index suffix; without a local
	    # part the index falls back to cu124.
	    _base_ver = _full_ver.partition("+")[0]
	    _cuda_suffix = _full_ver.rpartition("+")[2] if "+" in _full_ver else "cu124"
	    print(f"Detected torch {_full_ver}, reinstalling matching torchaudio...")
	    subprocess.run(
	        _PIP_INSTALL + [
	            "--force-reinstall",
	            "--no-deps",
	            f"torchaudio=={_base_ver}",
	            "--index-url",
	            f"https://download.pytorch.org/whl/{_cuda_suffix}",
	        ],
	        check=False,
	    )

	# ─────────────────────────────────────────────────────────────────────────────
	# ltx-core / ltx-pipelines source
	#
	# HDRICLoraPipeline and the modules it relies on (ltx_core.hdr,
	# ltx_pipelines.utils.blocks, load_video_conditioning_hdr,
	# apply_hdr_decode_postprocess, save_exr_tensor, encode_exr_sequence_to_mp4)
	# are NOT on the public main branch at the pinned commit used by the outpaint
	# app, so both packages are installed from the local ltx-2-internal checkout.
	# Set LTX_INTERNAL_PATH to point at a checkout elsewhere.
	# ─────────────────────────────────────────────────────────────────────────────
	_default_checkout = Path(__file__).resolve().parent / "ltx-2-internal"
	LTX_INTERNAL = Path(os.environ.get("LTX_INTERNAL_PATH", str(_default_checkout)))
	LTX_CORE_PKG = LTX_INTERNAL / "packages" / "ltx-core"
	LTX_PIPELINES_PKG = LTX_INTERNAL / "packages" / "ltx-pipelines"

	print(f"Installing ltx-core + ltx-pipelines from {LTX_INTERNAL}...")
	_install_cmd = [
	    sys.executable, "-m", "pip", "install",
	    "--force-reinstall", "--no-deps",
	]
	for _pkg_dir in (LTX_CORE_PKG, LTX_PIPELINES_PKG):
	    _install_cmd += ["-e", str(_pkg_dir)]
	# check=True: the app cannot run without these two packages.
	subprocess.run(_install_cmd, check=True)

	# Put both src/ trees at the front of sys.path (ltx-core first, then
	# ltx-pipelines) so the imports below resolve to this checkout.
	sys.path[:0] = [str(LTX_CORE_PKG / "src"), str(LTX_PIPELINES_PKG / "src")]

	import logging
	import random
	import tempfile
	import zipfile

	import torch
	torch._dynamo.config.suppress_errors = True
	torch._dynamo.config.disable = True

	import spaces
	import gradio as gr
	import numpy as np
	from huggingface_hub import hf_hub_download, snapshot_download

	from ltx_core.model.video_vae import TilingConfig
	from ltx_core.quantization import QuantizationPolicy
	from ltx_pipelines.hdr_ic_lora import HDRICLoraPipeline, _make_tiling_config
	from ltx_pipelines.utils.blocks import PromptEncoder
	from ltx_pipelines.utils.media_io import (
	encode_exr_sequence_to_mp4,
	get_videostream_metadata,
	save_exr_tensor,
	)
	from ltx_pipelines.utils.types import OffloadMode

	# Swap xformers' memory_efficient_attention into ltx_core's attention module
	# (same patch as the outpaint app). The before/after prints make it visible in
	# the startup log which implementation is active.
	from ltx_core.model.transformer import attention as _attn_mod

	print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
	try:
	    from xformers.ops import memory_efficient_attention as _mea
	except Exception as e:
	    # xformers is optional; report the failure and keep the existing attention.
	    print(f"[ATTN] xformers patch FAILED: {type(e).__name__}: {e}")
	else:
	    _attn_mod.memory_efficient_attention = _mea
	    print(f"[ATTN] After patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")

	# Raise the root logger to INFO.
	logging.getLogger().setLevel(logging.INFO)


	# ─────────────────────────────────────────────────────────────────────────────
	# Constants + model downloads
	# ─────────────────────────────────────────────────────────────────────────────
	# Largest seed offered by the UI / drawn at random: the int32 maximum.
	MAX_SEED = np.iinfo(np.int32).max

	# Canvas sizes as (width, height), keyed by quality tier and then by aspect
	# ratio. Every dimension is divisible by 32. (Frame counts, computed at
	# generation time, must additionally satisfy (n-1) % 8 == 0.)
	RESOLUTIONS = {
	    "low": {
	        "16:9": (768, 512),
	        "9:16": (512, 768),
	        "1:1": (768, 768),
	        "4:3": (768, 576),
	        "3:4": (576, 768),
	        "21:9": (768, 384),
	    },
	    "high": {
	        "16:9": (1536, 1024),
	        "9:16": (1024, 1536),
	        "1:1": (1024, 1024),
	        "4:3": (1536, 1152),
	        "3:4": (1152, 1536),
	        "21:9": (1536, 768),
	    },
	}

	# Hugging Face Hub coordinates of the base model, upscaler and text encoder.
	LTX_MODEL_REPO = "Lightricks/LTX-2.3"
	DISTILLED_CHECKPOINT = "ltx-2.3-22b-distilled-1.1.safetensors"
	SPATIAL_UPSCALER = "ltx-2.3-spatial-upscaler-x2-1.1.safetensors"
	GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"

	# Hub coordinates of the HDR IC-LoRA weights.
	HDR_LORA_REPO = "Lightricks/LTX-2.3-22b-IC-LoRA-HDR"
	HDR_LORA_FILENAME = "ltx-2.3-22b-ic-lora-hdr-0.9.safetensors"

	print("=" * 80)
	print("Downloading LTX-2.3 distilled + spatial upsampler + Gemma + HDR IC-LoRA...")
	print("=" * 80)

	# Fetch the individual weight files and the full Gemma snapshot at import
	# time; each call returns the local path of what it downloaded.
	checkpoint_path = hf_hub_download(
	    repo_id=LTX_MODEL_REPO,
	    filename=DISTILLED_CHECKPOINT,
	)
	spatial_upsampler_path = hf_hub_download(
	    repo_id=LTX_MODEL_REPO,
	    filename=SPATIAL_UPSCALER,
	)
	hdr_lora_path = hf_hub_download(
	    repo_id=HDR_LORA_REPO,
	    filename=HDR_LORA_FILENAME,
	)
	gemma_root = snapshot_download(repo_id=GEMMA_REPO)

	print(
	    f"Checkpoint: {checkpoint_path}",
	    f"Spatial upsampler: {spatial_upsampler_path}",
	    f"HDR IC-LoRA: {hdr_lora_path}",
	    f"Gemma root: {gemma_root}",
	    sep="\n",
	)


	# ─────────────────────────────────────────────────────────────────────────────
	# Text encoding: prompts are encoded on the fly with Gemma into
	# (video_context, audio_context). HDRICLoraPipeline wants a `.pt` path at
	# __init__, so a bootstrap file is written further down and
	# `pipeline.text_embeddings` is overwritten in memory on every generate call.
	# ─────────────────────────────────────────────────────────────────────────────
	_DTYPE = torch.bfloat16
	if torch.cuda.is_available():
	    _DEVICE = torch.device("cuda")
	else:
	    _DEVICE = torch.device("cpu")

	# Shared encoder instance used by encode_prompt_to_contexts().
	prompt_encoder = PromptEncoder(
	    gemma_root=gemma_root,
	    checkpoint_path=checkpoint_path,
	    device=_DEVICE,
	    dtype=_DTYPE,
	)


	def encode_prompt_to_contexts(prompt: str) -> tuple[torch.Tensor, torch.Tensor]:
	    """Encode one prompt into a ``(video_context, audio_context)`` pair.

	    The prompt is passed to the module-level ``prompt_encoder`` as a
	    one-element batch. When the encoder yields no audio encoding, an empty
	    tensor on the video encoding's device and dtype is returned in its place,
	    so callers always get two tensors.

	    HDRICLoraPipeline only consumes video_context; audio_context is kept for
	    shape-compat with the `.pt` interface but ignored during HDR generation.
	    MUST be called from inside a @spaces.GPU context on ZeroGPU.
	    """
	    encoded_batch = prompt_encoder([prompt])
	    (encoding,) = encoded_batch  # exactly one result for the single prompt
	    video_ctx = encoding.video_encoding
	    audio_ctx = encoding.audio_encoding
	    if audio_ctx is None:
	        audio_ctx = torch.zeros(0, device=video_ctx.device, dtype=video_ctx.dtype)
	    return video_ctx, audio_ctx


	# HDRICLoraPipeline.__init__ needs a .pt file it can torch.load, but only keeps
	# the tensors; __call__ reads `self.text_embeddings`, which is replaced on every
	# generate run. A zero-filled placeholder written at module load is therefore
	# enough (CPU only, no Gemma run: Gemma can only touch the GPU inside a
	# @spaces.GPU function on ZeroGPU).
	_bootstrap_emb_path = Path(tempfile.gettempdir(), "ltx_hdr_bootstrap_emb.pt")
	_placeholder = torch.zeros((1, 1, 4096), dtype=_DTYPE)
	torch.save(
	    dict.fromkeys(("video_context", "audio_context"), _placeholder),
	    _bootstrap_emb_path,
	)


	# ─────────────────────────────────────────────────────────────────────────────
	# Initialize pipeline
	# ─────────────────────────────────────────────────────────────────────────────
	# HDRICLoraPipeline is video-only (no audio path). The HDR transform (LogC3)
	# and reference_downscale_factor are auto-detected from the LoRA metadata.
	# The text-embeddings file is the placeholder written above; real embeddings
	# are injected per request.
	pipeline = HDRICLoraPipeline(
	    distilled_checkpoint_path=checkpoint_path,
	    spatial_upsampler_path=spatial_upsampler_path,
	    hdr_lora=hdr_lora_path,
	    text_embeddings_path=str(_bootstrap_emb_path),
	    offload_mode=OffloadMode.NONE,
	    quantization=QuantizationPolicy.fp8_cast(),
	)
	_ready_msg = (
	    f"HDRICLoraPipeline ready. HDR transform: {pipeline.hdr_transform}, "
	    f"ref_downscale={pipeline.reference_downscale_factor}"
	)
	print(_ready_msg)
	print("=" * 80)


	# ─────────────────────────────────────────────────────────────────────────────
	# UI helpers
	# ─────────────────────────────────────────────────────────────────────────────
	def detect_aspect_ratio(video_path) -> str:
	    """Return the supported aspect-ratio label closest to the video's own ratio.

	    The width/height ratio read from the video's stream metadata is compared
	    against the six ratios offered by the UI and the nearest label is
	    returned. ``"16:9"`` is the fallback when ``video_path`` is ``None`` or
	    when the metadata cannot be read / the ratio cannot be computed.
	    """
	    fallback = "16:9"
	    if video_path is None:
	        return fallback

	    try:
	        info = get_videostream_metadata(str(video_path))
	        source_ratio = info.width / info.height
	    except Exception:
	        # Unreadable file, missing stream, zero height, ...: use the default.
	        return fallback

	    named_ratios = (
	        ("16:9", 16 / 9),
	        ("9:16", 9 / 16),
	        ("1:1", 1.0),
	        ("4:3", 4 / 3),
	        ("3:4", 3 / 4),
	        ("21:9", 21 / 9),
	    )
	    # Smallest absolute distance wins; ties resolve to the earliest entry.
	    best_label, _ = min(named_ratios, key=lambda pair: abs(source_ratio - pair[1]))
	    return best_label


	def get_target_resolution(target_aspect: str, high_res: bool) -> tuple[int, int]:
	    """Look up the canvas size for an aspect-ratio label in ``RESOLUTIONS``.

	    ``high_res`` selects the ``"high"`` tier, otherwise ``"low"`` is used. An
	    aspect label that is not in the table falls back to that tier's ``"16:9"``
	    entry.
	    """
	    sizes_by_aspect = RESOLUTIONS["high"] if high_res else RESOLUTIONS["low"]
	    if target_aspect not in sizes_by_aspect:
	        target_aspect = "16:9"
	    return sizes_by_aspect[target_aspect]


	def on_video_upload(video):
	    """Derive UI defaults from a freshly uploaded video.

	    Returns three ``gr.update`` objects, in order: duration, aspect ratio and
	    frame rate. With no video, all three are no-op updates. Otherwise the
	    duration is the clip length capped at 6.7 s and rounded to one decimal,
	    the frame rate is the clip's own fps, and the aspect ratio comes from
	    ``detect_aspect_ratio``. If the metadata cannot be read, duration and fps
	    fall back to 3.0 s / 24.0 fps.
	    """
	    if video is None:
	        return gr.update(), gr.update(), gr.update()

	    try:
	        info = get_videostream_metadata(str(video))
	        clip_seconds = info.frames / info.fps
	        dur = round(min(clip_seconds, 6.7), 1)
	        fps = float(info.fps)
	    except Exception:
	        dur = 3.0
	        fps = 24.0

	    aspect_label = detect_aspect_ratio(video)
	    return (
	        gr.update(value=dur),
	        gr.update(value=aspect_label),
	        gr.update(value=fps),
	    )


	# ─────────────────────────────────────────────────────────────────────────────
	# Generation
	# ─────────────────────────────────────────────────────────────────────────────
	@spaces.GPU(duration=300)
	@torch.inference_mode()
	def generate_video(
	    input_video,
	    prompt: str,
	    duration: float,
	    frame_rate: float,
	    target_aspect: str,
	    high_res: bool,
	    seed: int,
	    randomize_seed: bool,
	    high_quality_hdr: bool,
	    export_exr: bool,
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Run the HDR IC-LoRA pipeline on an uploaded video.

	    Args:
	        input_video: Path of the source video (as delivered by ``gr.Video``).
	        prompt: Scene description; ``None`` / empty is encoded as ``""``.
	        duration: Requested clip length in seconds.
	        frame_rate: Output frame rate; must be a positive number.
	        target_aspect: Aspect-ratio label used to pick the canvas size.
	        high_res: Use the "high" resolution tier and the larger spatial tile.
	        seed: Seed used when ``randomize_seed`` is false.
	        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` instead.
	        high_quality_hdr: Forwarded to the pipeline unchanged.
	        export_exr: Also package the per-frame EXR files into a zip.
	        progress: Gradio progress tracker (tqdm-driven).

	    Returns:
	        ``(preview_mp4_path, exr_zip_path_or_None, seed_used)``. On any
	        failure the error and traceback are printed and
	        ``(None, None, seed_used)`` is returned.

	    The loose EXR frame directory is removed once the preview (and, if
	    requested, the zip) has been written, and the whole per-run temp
	    directory is removed when the run fails, so runs do not pile up
	    redundant float32 frames in the temp dir.
	    """
	    import shutil  # local import: only needed for temp-dir cleanup here

	    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
	    work_dir = None  # set once the per-run temp dir exists; used for cleanup
	    try:
	        torch.cuda.reset_peak_memory_stats()

	        if input_video is None:
	            raise ValueError("Please upload a source video.")
	        # Fail with a clear message instead of an opaque TypeError further down.
	        if frame_rate is None or frame_rate <= 0:
	            raise ValueError("Frame rate must be a positive number.")

	        video_path = str(input_video)

	        # Frame count must satisfy (n-1) % 8 == 0: round (n-1) up to a
	        # multiple of 8.
	        num_frames = int(duration * frame_rate) + 1
	        num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1

	        target_w, target_h = get_target_resolution(target_aspect, high_res)

	        print(f"[HDR] {target_h}x{target_w}, frames={num_frames}, fps={frame_rate}, "
	              f"seed={current_seed}, aspect={target_aspect}, hq_hdr={high_quality_hdr}")

	        # Encode prompt -> (video_context, audio_context) and swap into the
	        # pipeline. Gemma is loaded, used, and freed inside prompt_encoder.
	        print(f"[HDR] Encoding prompt: {prompt!r}")
	        video_context, audio_context = encode_prompt_to_contexts(prompt or "")
	        pipeline.text_embeddings = (video_context, audio_context)

	        # Spatial tile size: 768 for the low-res tier, 1280 for high-res.
	        tiling_config = _make_tiling_config(spatial_tile=768 if not high_res else 1280)

	        hdr_video = pipeline(
	            seed=current_seed,
	            height=int(target_h),
	            width=int(target_w),
	            num_frames=num_frames,
	            frame_rate=float(frame_rate),
	            video_conditioning=[(video_path, 1.0)],
	            tiling_config=tiling_config,
	            high_quality_hdr=high_quality_hdr,
	        )

	        # The pipeline's internal resize_and_reflect_pad adds bottom/right
	        # reflection when source aspect != target aspect. Its built-in crop
	        # (_decode_video's `out[:, :crop_size[1], :crop_size[0], :]`) only
	        # undoes the 64-divisor alignment padding — it leaves aspect-mismatch
	        # reflection bands in the decoded output. Apply the same top-left
	        # slice here with the un-reflected content region. Best effort: any
	        # failure is logged and the uncropped video is used.
	        try:
	            src_meta = get_videostream_metadata(video_path)
	            src_aspect = src_meta.width / src_meta.height
	            tgt_aspect = target_w / target_h
	            if src_aspect > tgt_aspect:
	                # Source is wider than the canvas: padding sits at the bottom.
	                content_h = int(round(target_w / src_aspect))
	                content_h -= content_h % 2  # libx264 yuv420p needs even dims
	                if 0 < content_h < hdr_video.shape[1]:
	                    print(f"[HDR] Cropping reflected bottom: {hdr_video.shape[1]} -> {content_h}")
	                    hdr_video = hdr_video[:, :content_h, :, :]
	            elif src_aspect < tgt_aspect:
	                # Source is narrower than the canvas: padding sits on the right.
	                content_w = int(round(target_h * src_aspect))
	                content_w -= content_w % 2  # libx264 yuv420p needs even dims
	                if 0 < content_w < hdr_video.shape[2]:
	                    print(f"[HDR] Cropping reflected right: {hdr_video.shape[2]} -> {content_w}")
	                    hdr_video = hdr_video[:, :, :content_w, :]
	        except Exception as e:
	            print(f"[HDR] Post-crop skipped: {type(e).__name__}: {e}")

	        # hdr_video is [f, h, w, c] linear HDR float. Write EXR frames, then
	        # tonemap to a libx264 mp4 for in-browser preview.
	        work_dir = Path(tempfile.mkdtemp(prefix="ltx-hdr-"))
	        exr_dir = work_dir / "exr"
	        exr_dir.mkdir(parents=True, exist_ok=True)

	        for j in range(hdr_video.shape[0]):
	            save_exr_tensor(
	                hdr_video[j].detach().cpu(),
	                str(exr_dir / f"frame_{j:05d}.exr"),
	                half=False,
	            )
	        # Drop the tensor reference before encoding the preview.
	        del hdr_video

	        preview_mp4 = work_dir / "preview.mp4"
	        encode_exr_sequence_to_mp4(exr_dir, preview_mp4, frame_rate=float(frame_rate))

	        exr_zip_path = None
	        if export_exr:
	            exr_zip_path = work_dir / "exr.zip"
	            # ZIP_STORED: frames are archived without compression.
	            with zipfile.ZipFile(exr_zip_path, "w", compression=zipfile.ZIP_STORED) as zf:
	                for exr in sorted(exr_dir.glob("frame_*.exr")):
	                    zf.write(exr, arcname=exr.name)

	        # The loose frames are now either duplicated inside exr.zip or not
	        # wanted at all; only preview.mp4 / exr.zip are handed back.
	        shutil.rmtree(exr_dir, ignore_errors=True)

	        return str(preview_mp4), (str(exr_zip_path) if exr_zip_path else None), current_seed

	    except Exception as e:
	        import traceback
	        print(f"Error: {e}\n{traceback.format_exc()}")
	        # Nothing from a failed run is returned, so discard its temp files.
	        if work_dir is not None:
	            shutil.rmtree(work_dir, ignore_errors=True)
	        return None, None, current_seed


	# ─────────────────────────────────────────────────────────────────────────────
	# Gradio UI — LTX 2.3 HDR
	# ─────────────────────────────────────────────────────────────────────────────
	# Custom CSS: taller generate button, hidden Gradio footer, and videos that
	# letterbox (object-fit: contain) instead of being cropped.
	css = """
	.generate-btn { min-height: 52px !important; font-size: 1.1em !important; }
	footer { display: none !important; }
	video { object-fit: contain !important; }
	"""

	# Citrus theme recoloured with purple primary/secondary and gray neutral hues.
	theme = gr.themes.Citrus(
	primary_hue=gr.themes.colors.purple,
	secondary_hue=gr.themes.colors.purple,
	neutral_hue=gr.themes.colors.gray,
	)

	# Page layout. Components are bound to module-level names so the event
	# wiring further down can reference them.
	with gr.Blocks(title="LTX 2.3 HDR", css=css, theme=theme) as demo:
	# NOTE(review): the link in the header below points at
	# diffusers-internal-dev/LTX-HDR-LoRA, while the weights are downloaded from
	# HDR_LORA_REPO ("Lightricks/LTX-2.3-22b-IC-LoRA-HDR") — confirm which is intended.
	gr.Markdown("""
	# LTX 2.3 HDR ✨
	Reconstruct the highlights and shadows clipped away by the camera with LTX-2.3 HDR:
	Video-to-video HDR via LTX-2.3 + [HDR IC-LoRA](https://huggingface.co/diffusers-internal-dev/LTX-HDR-LoRA).
	Output is linear HDR (LogC3 inverse decoded).
	""")

	with gr.Row():
	# Input side: source video, prompt and generation settings.
	with gr.Column(scale=1):
	input_video = gr.Video(label="Source Video")

	prompt = gr.Textbox(
	label="Prompt",
	info="Describe the scene being regenerated in HDR",
	lines=2,
	placeholder="a cinematic sunset over mountains, high dynamic range, bright sky, deep shadows",
	)

	with gr.Row():
	# Choices match the aspect labels used as keys in RESOLUTIONS.
	target_aspect = gr.Dropdown(
	label="Aspect Ratio",
	choices=["16:9", "9:16", "1:1", "4:3", "3:4", "21:9"],
	value="16:9",
	)
	# The 6.7 s maximum matches the cap applied in on_video_upload.
	duration = gr.Slider(
	label="Duration (s)", minimum=1.0, maximum=6.7, value=3.0, step=0.1,
	)
	frame_rate = gr.Number(label="FPS", value=24.0, precision=2)

	generate_btn = gr.Button(
	"Generate HDR", variant="primary", size="lg", elem_classes=["generate-btn"],
	)

	with gr.Accordion("Advanced Settings", open=False):
	high_res = gr.Checkbox(label="High Resolution (2×)", value=False)
	high_quality_hdr = gr.Checkbox(
	label="High-Quality HDR",
	value=False,
	info="Generates at 2× internal frame count, keeps every other frame. ~2× slower, smoother temporally.",
	)
	export_exr = gr.Checkbox(
	label="Export EXR frames (zip)",
	value=True,
	info="Per-frame linear OpenEXR (float32). Uncheck if you only want the mp4 preview.",
	)
	seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, value=10, step=1)
	randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)

	# Output side: tonemapped preview plus the optional EXR archive.
	with gr.Column(scale=1):
	output_video = gr.Video(label="Preview (sRGB tonemap)", autoplay=True, height=480)
	gr.Markdown("The preview mp4 is a fixed-EV sRGB tonemap; the EXR zip contains the full linear float frames for grading.")
	output_exr = gr.File(label="EXR frames (zip)")

	# When the source video changes, refresh duration / aspect ratio / fps
	# from the clip's metadata.
	input_video.change(
	fn=on_video_upload,
	inputs=[input_video],
	outputs=[duration, target_aspect, frame_rate],
	)

	# Input order must match generate_video's positional parameters; the seed
	# actually used is written back into the seed slider.
	generate_btn.click(
	fn=generate_video,
	inputs=[
	input_video, prompt, duration, frame_rate, target_aspect, high_res,
	seed, randomize_seed, high_quality_hdr, export_exr,
	],
	outputs=[output_video, output_exr, seed],
	)


	# Start the Gradio server only when executed as a script.
	if __name__ == "__main__":
	demo.launch()