Spaces:
Running on Zero
Running on Zero
| import os | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
# Environment switches that have to be in place before the heavy imports:
#   * torch.compile / dynamo are disabled before any torch import.
#   * OpenImageIO / OpenEXR readers need OPENCV_IO_ENABLE_OPENEXR to be set
#     before cv2 is imported.
os.environ.update({
    "TORCH_COMPILE_DISABLE": "1",
    "TORCHDYNAMO_DISABLE": "1",
    "OPENCV_IO_ENABLE_OPENEXR": "1",
})

# Common prefix of every pip call below; packages go into the interpreter
# that is running this script.
_PIP_INSTALL = [sys.executable, "-m", "pip", "install"]

# xformers supplies memory-efficient attention. check=False: a failed install
# does not stop startup.
subprocess.run(
    _PIP_INSTALL + ["xformers==0.0.32.post2", "--no-build-isolation"],
    check=False,
)

# Video / HDR dependencies, installed in one pip call (also check=False).
_MEDIA_PACKAGES = [
    "imageio[ffmpeg]",
    "scikit-image",
    "opencv-python-headless",
    "decord",
    "num2words",
    "OpenImageIO",
]
subprocess.run(_PIP_INSTALL + _MEDIA_PACKAGES, check=False)

# num2words is installed once more on its own, this time with check=True, so
# a failure here raises CalledProcessError and aborts startup.
subprocess.run(_PIP_INSTALL + ["num2words"], check=True)

# Reinstall torchaudio to match the torch CUDA version on this space. The
# installed torch version is read by a child interpreter.
_tv = subprocess.run(
    [sys.executable, "-c", "import torch; print(torch.__version__)"],
    capture_output=True,
    text=True,
)
if _tv.returncode == 0:
    _full_ver = _tv.stdout.strip()
    # "<base>+<local>": the base part pins torchaudio, the local part selects
    # the wheel index. Without a local part the index falls back to cu124.
    _base_ver = _full_ver.partition("+")[0]
    _cuda_suffix = _full_ver.rpartition("+")[2] if "+" in _full_ver else "cu124"
    print(f"Detected torch {_full_ver}, reinstalling matching torchaudio...")
    subprocess.run(
        _PIP_INSTALL + [
            "--force-reinstall",
            "--no-deps",
            f"torchaudio=={_base_ver}",
            "--index-url",
            f"https://download.pytorch.org/whl/{_cuda_suffix}",
        ],
        check=False,
    )
# ─────────────────────────────────────────────────────────────────────────────
# ltx-core / ltx-pipelines source
#
# The HDRICLoraPipeline and its supporting modules (ltx_core.hdr,
# ltx_pipelines.utils.blocks, load_video_conditioning_hdr,
# apply_hdr_decode_postprocess, save_exr_tensor, encode_exr_sequence_to_mp4)
# are NOT on the public main branch at the pinned commit used by the outpaint
# app. We install from the local ltx-2-internal checkout so the HDR code path
# actually exists.
# ─────────────────────────────────────────────────────────────────────────────
# The checkout location can be overridden with LTX_INTERNAL_PATH; the default
# is an "ltx-2-internal" directory next to this file.
_default_checkout = Path(__file__).resolve().parent / "ltx-2-internal"
LTX_INTERNAL = Path(os.environ.get("LTX_INTERNAL_PATH", str(_default_checkout)))

_packages_root = LTX_INTERNAL / "packages"
LTX_CORE_PKG = _packages_root / "ltx-core"
LTX_PIPELINES_PKG = _packages_root / "ltx-pipelines"

print(f"Installing ltx-core + ltx-pipelines from {LTX_INTERNAL}...")

# Both packages are installed editable in a single pip call; check=True makes
# a failed install abort startup.
_editable_args = []
for _pkg in (LTX_CORE_PKG, LTX_PIPELINES_PKG):
    _editable_args += ["-e", str(_pkg)]
subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "--force-reinstall", "--no-deps",
        *_editable_args,
    ],
    check=True,
)

# Put both src/ trees at the front of sys.path. ltx-core is inserted last, so
# it ends up first on the path.
for _pkg in (LTX_PIPELINES_PKG, LTX_CORE_PKG):
    sys.path.insert(0, str(_pkg / "src"))
| import logging | |
| import random | |
| import tempfile | |
| import zipfile | |
| import torch | |
| torch._dynamo.config.suppress_errors = True | |
| torch._dynamo.config.disable = True | |
| import spaces | |
| import gradio as gr | |
| import numpy as np | |
| from huggingface_hub import hf_hub_download, snapshot_download | |
| from ltx_core.model.video_vae import TilingConfig | |
| from ltx_core.quantization import QuantizationPolicy | |
| from ltx_pipelines.hdr_ic_lora import HDRICLoraPipeline, _make_tiling_config | |
| from ltx_pipelines.utils.blocks import PromptEncoder | |
| from ltx_pipelines.utils.media_io import ( | |
| encode_exr_sequence_to_mp4, | |
| get_videostream_metadata, | |
| save_exr_tensor, | |
| ) | |
| from ltx_pipelines.utils.types import OffloadMode | |
# xformers attention patch (same as the outpaint app): rebind
# ltx_core's `memory_efficient_attention` to the xformers implementation.
# Any failure is reported and startup continues with the attribute untouched.
from ltx_core.model.transformer import attention as _attn_mod

print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
try:
    from xformers.ops import memory_efficient_attention as _mea

    setattr(_attn_mod, "memory_efficient_attention", _mea)
    print(f"[ATTN] After patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
except Exception as exc:
    print(f"[ATTN] xformers patch FAILED: {type(exc).__name__}: {exc}")

# Root logger at INFO.
logging.root.setLevel(logging.INFO)
# ─────────────────────────────────────────────────────────────────────────────
# Constants + model downloads
# ─────────────────────────────────────────────────────────────────────────────
# Upper bound of the seed slider and of randomly drawn seeds.
MAX_SEED = np.iinfo(np.int32).max

# Frames must satisfy (n-1) % 8 == 0. Aspect-ratio canvas sizes (divisible by
# 32), stored as (width, height) per quality tier.
RESOLUTIONS = dict(
    low={
        "16:9": (768, 512),
        "9:16": (512, 768),
        "1:1": (768, 768),
        "4:3": (768, 576),
        "3:4": (576, 768),
        "21:9": (768, 384),
    },
    high={
        "16:9": (1536, 1024),
        "9:16": (1024, 1536),
        "1:1": (1024, 1024),
        "4:3": (1536, 1152),
        "3:4": (1152, 1536),
        "21:9": (1536, 768),
    },
)

# Hugging Face Hub repositories and file names.
LTX_MODEL_REPO = "Lightricks/LTX-2.3"
DISTILLED_CHECKPOINT = "ltx-2.3-22b-distilled-1.1.safetensors"
SPATIAL_UPSCALER = "ltx-2.3-spatial-upscaler-x2-1.1.safetensors"
GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
HDR_LORA_REPO = "Lightricks/LTX-2.3-22b-IC-LoRA-HDR"
HDR_LORA_FILENAME = "ltx-2.3-22b-ic-lora-hdr-0.9.safetensors"

print("=" * 80)
print("Downloading LTX-2.3 distilled + spatial upsampler + Gemma + HDR IC-LoRA...")
print("=" * 80)

# Download order: distilled checkpoint, spatial upsampler, HDR LoRA, Gemma.
checkpoint_path, spatial_upsampler_path = (
    hf_hub_download(repo_id=LTX_MODEL_REPO, filename=_filename)
    for _filename in (DISTILLED_CHECKPOINT, SPATIAL_UPSCALER)
)
hdr_lora_path = hf_hub_download(repo_id=HDR_LORA_REPO, filename=HDR_LORA_FILENAME)
gemma_root = snapshot_download(repo_id=GEMMA_REPO)

for _label, _location in (
    ("Checkpoint", checkpoint_path),
    ("Spatial upsampler", spatial_upsampler_path),
    ("HDR IC-LoRA", hdr_lora_path),
    ("Gemma root", gemma_root),
):
    print(f"{_label}: {_location}")
# ─────────────────────────────────────────────────────────────────────────────
# Text encoding: on-the-fly Gemma -> (video_context, audio_context) for each
# prompt. HDRICLoraPipeline expects a `.pt` path at __init__, so a placeholder
# file is written at module load and `pipeline.text_embeddings` is overwritten
# in memory on each generate call.
# ─────────────────────────────────────────────────────────────────────────────
# Use CUDA when torch reports it as available, otherwise the CPU.
_DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
_DTYPE = torch.bfloat16

_prompt_encoder_kwargs = {
    "checkpoint_path": checkpoint_path,
    "gemma_root": gemma_root,
    "dtype": _DTYPE,
    "device": _DEVICE,
}
prompt_encoder = PromptEncoder(**_prompt_encoder_kwargs)
def encode_prompt_to_contexts(prompt: str) -> tuple[torch.Tensor, torch.Tensor]:
    """Encode a single prompt into a ``(video_context, audio_context)`` pair.

    The prompt is passed to the module-level ``prompt_encoder`` as a
    one-element batch. HDRICLoraPipeline only consumes video_context;
    audio_context is kept for shape-compat with the ``.pt`` interface but is
    ignored during HDR generation. When the encoder returns no audio encoding,
    an empty tensor on the same device and with the same dtype as the video
    encoding is returned in its place.

    MUST be called from inside a @spaces.GPU context on ZeroGPU.
    """
    # A one-prompt batch must produce exactly one result.
    (encoded,) = prompt_encoder([prompt])

    video_context = encoded.video_encoding
    audio_context = encoded.audio_encoding
    if audio_context is None:
        audio_context = torch.zeros(
            0, device=video_context.device, dtype=video_context.dtype
        )
    return video_context, audio_context
# HDRICLoraPipeline.__init__ requires a .pt it can torch.load, but it only
# stores the tensors — __call__ reads `self.text_embeddings`, which we
# overwrite on every generate run. So write a placeholder .pt at module load
# (CPU, no Gemma run — Gemma can only touch GPU inside a @spaces.GPU function
# on ZeroGPU).
_placeholder = torch.zeros(1, 1, 4096, dtype=_DTYPE)
_bootstrap_emb_path = Path(tempfile.gettempdir(), "ltx_hdr_bootstrap_emb.pt")
torch.save(
    dict(video_context=_placeholder, audio_context=_placeholder),
    _bootstrap_emb_path,
)
# ─────────────────────────────────────────────────────────────────────────────
# Initialize pipeline
# ─────────────────────────────────────────────────────────────────────────────
# HDRICLoraPipeline is video-only (no audio path). HDR transform (LogC3) and
# reference_downscale_factor are auto-detected from the LoRA metadata.
_pipeline_kwargs = {
    "distilled_checkpoint_path": checkpoint_path,
    "spatial_upsampler_path": spatial_upsampler_path,
    "hdr_lora": hdr_lora_path,
    # Placeholder embeddings file; the real embeddings are swapped in per run.
    "text_embeddings_path": str(_bootstrap_emb_path),
    "quantization": QuantizationPolicy.fp8_cast(),
    "offload_mode": OffloadMode.NONE,
}
pipeline = HDRICLoraPipeline(**_pipeline_kwargs)

_ready_message = (
    f"HDRICLoraPipeline ready. HDR transform: {pipeline.hdr_transform}, "
    f"ref_downscale={pipeline.reference_downscale_factor}"
)
print(_ready_message)
print("=" * 80)
# ─────────────────────────────────────────────────────────────────────────────
# UI helpers
# ─────────────────────────────────────────────────────────────────────────────
def detect_aspect_ratio(video_path) -> str:
    """Return the supported aspect-ratio label closest to the video's own.

    The width/height ratio from the video stream metadata is compared with
    each supported ratio and the label with the smallest absolute difference
    wins. ``"16:9"`` is returned when no path is given or when the metadata
    cannot be read.
    """
    fallback = "16:9"
    if video_path is None:
        return fallback

    try:
        info = get_videostream_metadata(str(video_path))
        measured = info.width / info.height
    except Exception:
        # Unreadable metadata (or a zero height): use the default label.
        return fallback

    labelled_ratios = (
        ("16:9", 16 / 9),
        ("9:16", 9 / 16),
        ("1:1", 1.0),
        ("4:3", 4 / 3),
        ("3:4", 3 / 4),
        ("21:9", 21 / 9),
    )
    closest_label, _ = min(labelled_ratios, key=lambda pair: abs(measured - pair[1]))
    return closest_label
def get_target_resolution(target_aspect: str, high_res: bool) -> tuple[int, int]:
    """Look up the (width, height) canvas for an aspect label and quality tier.

    ``high_res`` selects the "high" tier of ``RESOLUTIONS``, otherwise "low".
    An aspect label that is not in the table falls back to the tier's "16:9"
    entry.
    """
    sizes = RESOLUTIONS["high" if high_res else "low"]
    if target_aspect in sizes:
        return sizes[target_aspect]
    return sizes["16:9"]
def on_video_upload(video):
    """Pre-fill the duration, aspect-ratio and FPS controls from an upload.

    Args:
        video: Path of the uploaded video, or ``None`` when the input was
            cleared.

    Returns:
        A 3-tuple of ``gr.update`` objects for the duration slider, the
        aspect-ratio dropdown and the FPS field, in that order. When ``video``
        is ``None`` all three are no-op updates. When the metadata cannot be
        read (or reports a non-positive frame rate) the duration and FPS fall
        back to 3.0 s and 24.0 fps.
    """
    if video is None:
        return gr.update(), gr.update(), gr.update()

    # Bounds of the "Duration (s)" slider. The suggested value must stay
    # inside them, so clips shorter than one second are clamped up.
    min_duration, max_duration = 1.0, 6.7
    try:
        meta = get_videostream_metadata(str(video))
        fps = float(meta.fps)
        if not fps > 0:
            # Also rejects NaN; handled by the fallback below.
            raise ValueError(f"unusable frame rate: {fps}")
        clip_seconds = meta.frames / fps
        dur = round(min(max(clip_seconds, min_duration), max_duration), 1)
    except Exception:
        # Best effort: unreadable metadata must not break the upload handler.
        dur, fps = 3.0, 24.0
    return (
        gr.update(value=dur),
        gr.update(value=detect_aspect_ratio(video)),
        gr.update(value=fps),
    )
# ─────────────────────────────────────────────────────────────────────────────
# Generation
# ─────────────────────────────────────────────────────────────────────────────
def _crop_reflected_padding(hdr_video, video_path, target_w, target_h):
    """Trim the reflection band the pipeline adds for aspect mismatches.

    The pipeline's internal resize_and_reflect_pad adds bottom/right
    reflection when source aspect != target aspect. Its built-in crop
    (_decode_video's `out[:, :crop_size[1], :crop_size[0], :]`) only undoes
    the 64-divisor alignment padding — it leaves aspect-mismatch reflection
    bands in the decoded output. This applies the same top-left slice with the
    un-reflected content region.

    Best effort: on any failure the reason is printed and the frames are
    returned unchanged.
    """
    try:
        src_meta = get_videostream_metadata(video_path)
        src_aspect = src_meta.width / src_meta.height
        tgt_aspect = target_w / target_h
        if src_aspect > tgt_aspect:
            # Source is wider than the canvas: the band is at the bottom.
            content_h = int(round(target_w / src_aspect))
            content_h -= content_h % 2  # libx264 yuv420p needs even dims
            if 0 < content_h < hdr_video.shape[1]:
                print(f"[HDR] Cropping reflected bottom: {hdr_video.shape[1]} -> {content_h}")
                hdr_video = hdr_video[:, :content_h, :, :]
        elif src_aspect < tgt_aspect:
            # Source is narrower than the canvas: the band is on the right.
            content_w = int(round(target_h * src_aspect))
            content_w -= content_w % 2  # libx264 yuv420p needs even dims
            if 0 < content_w < hdr_video.shape[2]:
                print(f"[HDR] Cropping reflected right: {hdr_video.shape[2]} -> {content_w}")
                hdr_video = hdr_video[:, :, :content_w, :]
    except Exception as e:
        print(f"[HDR] Post-crop skipped: {type(e).__name__}: {e}")
    return hdr_video


# Prompt encoding and the pipeline both need the GPU, which on ZeroGPU is only
# attached inside a @spaces.GPU function.
# NOTE(review): duration=120 is an estimate of the longest run — tune it to
# the runtimes actually observed for high-res / high-quality jobs.
@spaces.GPU(duration=120)
def generate_video(
    input_video,
    prompt: str,
    duration: float,
    frame_rate: float,
    target_aspect: str,
    high_res: bool,
    seed: int,
    randomize_seed: bool,
    high_quality_hdr: bool,
    export_exr: bool,
    progress=gr.Progress(track_tqdm=True),
):
    """Run video-to-video HDR generation and write the preview / EXR outputs.

    Args:
        input_video: Path of the source video; required.
        prompt: Scene description; ``None`` or empty is encoded as "".
        duration: Requested clip length in seconds (must be positive).
        frame_rate: Output frame rate in fps (must be positive).
        target_aspect: Aspect-ratio label used to pick the canvas size.
        high_res: Select the "high" resolution tier and a larger spatial tile.
        seed: Seed used when ``randomize_seed`` is false.
        randomize_seed: Draw a fresh seed in ``[0, MAX_SEED]`` instead.
        high_quality_hdr: Forwarded to the pipeline unchanged.
        export_exr: Also bundle the per-frame EXR files into a zip.
        progress: Gradio progress tracker (tqdm bars are mirrored in the UI).

    Returns:
        ``(preview_mp4_path, exr_zip_path_or_None, seed_used)``. On any
        failure the error is printed with its traceback, shown to the user as
        a Gradio warning, and ``(None, None, seed_used)`` is returned.
    """
    import shutil
    import traceback

    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    work_dir = None
    try:
        if input_video is None:
            raise ValueError("Please upload a source video.")
        if duration is None or frame_rate is None or duration <= 0 or frame_rate <= 0:
            raise ValueError("Duration and FPS must be positive numbers.")
        # The CUDA statistics call raises on hosts without a GPU.
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()
        video_path = str(input_video)

        # Frame count must satisfy (n-1) % 8 == 0: round the requested frame
        # count up to the next multiple of 8, then add one.
        requested_frames = int(duration * frame_rate)
        num_frames = ((requested_frames + 7) // 8) * 8 + 1
        target_w, target_h = get_target_resolution(target_aspect, high_res)
        print(f"[HDR] {target_h}x{target_w}, frames={num_frames}, fps={frame_rate}, "
              f"seed={current_seed}, aspect={target_aspect}, hq_hdr={high_quality_hdr}")

        # Encode prompt -> (video_context, audio_context) and swap into the
        # pipeline. Gemma is loaded, used, and freed inside prompt_encoder.
        print(f"[HDR] Encoding prompt: {prompt!r}")
        video_context, audio_context = encode_prompt_to_contexts(prompt or "")
        pipeline.text_embeddings = (video_context, audio_context)

        # Tiling config: smaller spatial tile on lower-VRAM targets
        tiling_config = _make_tiling_config(spatial_tile=768 if not high_res else 1280)
        hdr_video = pipeline(
            seed=current_seed,
            height=int(target_h),
            width=int(target_w),
            num_frames=num_frames,
            frame_rate=float(frame_rate),
            video_conditioning=[(video_path, 1.0)],
            tiling_config=tiling_config,
            high_quality_hdr=high_quality_hdr,
        )
        hdr_video = _crop_reflected_padding(hdr_video, video_path, target_w, target_h)

        # hdr_video is [f, h, w, c] linear HDR float. Write EXR frames, then
        # tonemap to a libx264 mp4 for in-browser preview.
        work_dir = Path(tempfile.mkdtemp(prefix="ltx-hdr-"))
        exr_dir = work_dir / "exr"
        exr_dir.mkdir(parents=True, exist_ok=True)
        for j in range(hdr_video.shape[0]):
            save_exr_tensor(
                hdr_video[j].detach().cpu(),
                str(exr_dir / f"frame_{j:05d}.exr"),
                half=False,
            )
        del hdr_video  # release the decoded frames before encoding

        preview_mp4 = work_dir / "preview.mp4"
        encode_exr_sequence_to_mp4(exr_dir, preview_mp4, frame_rate=float(frame_rate))

        exr_zip_path = None
        if export_exr:
            exr_zip_path = work_dir / "exr.zip"
            with zipfile.ZipFile(exr_zip_path, "w", compression=zipfile.ZIP_STORED) as zf:
                for exr in sorted(exr_dir.glob("frame_*.exr")):
                    zf.write(exr, arcname=exr.name)

        # Only the mp4 and the optional zip are returned, so the loose
        # float32 frames are removed instead of piling up in the temp dir.
        shutil.rmtree(exr_dir, ignore_errors=True)
        return str(preview_mp4), (str(exr_zip_path) if exr_zip_path else None), current_seed
    except Exception as e:
        print(f"Error: {e}\n{traceback.format_exc()}")
        # Drop partial outputs of the failed run.
        if work_dir is not None:
            shutil.rmtree(work_dir, ignore_errors=True)
        # Surface the failure in the UI while keeping the return contract.
        gr.Warning(f"HDR generation failed: {e}")
        return None, None, current_seed
# ─────────────────────────────────────────────────────────────────────────────
# Gradio UI — LTX 2.3 HDR
# ─────────────────────────────────────────────────────────────────────────────
# Page-level CSS: taller generate button, hidden footer, letterboxed videos.
css = """
.generate-btn { min-height: 52px !important; font-size: 1.1em !important; }
footer { display: none !important; }
video { object-fit: contain !important; }
"""

theme = gr.themes.Citrus(
    primary_hue=gr.themes.colors.purple,
    secondary_hue=gr.themes.colors.purple,
    neutral_hue=gr.themes.colors.gray,
)

with gr.Blocks(title="LTX 2.3 HDR", css=css, theme=theme) as demo:
    # The IC-LoRA link is built from HDR_LORA_REPO so it always points at the
    # repository the weights are downloaded from. The text is kept flush-left
    # so leading whitespace cannot turn it into a Markdown code block.
    gr.Markdown(f"""
# LTX 2.3 HDR ✨
Reconstruct the highlights and shadows clipped away by the camera with LTX-2.3 HDR:
Video-to-video HDR via LTX-2.3 + [HDR IC-LoRA](https://huggingface.co/{HDR_LORA_REPO}).
Output is linear HDR (LogC3 inverse decoded).
""")

    with gr.Row():
        # Left column: inputs and settings.
        with gr.Column(scale=1):
            input_video = gr.Video(label="Source Video")
            prompt = gr.Textbox(
                label="Prompt",
                info="Describe the scene being regenerated in HDR",
                lines=2,
                placeholder="a cinematic sunset over mountains, high dynamic range, bright sky, deep shadows",
            )
            with gr.Row():
                target_aspect = gr.Dropdown(
                    label="Aspect Ratio",
                    choices=["16:9", "9:16", "1:1", "4:3", "3:4", "21:9"],
                    value="16:9",
                )
                duration = gr.Slider(
                    label="Duration (s)", minimum=1.0, maximum=6.7, value=3.0, step=0.1,
                )
                frame_rate = gr.Number(label="FPS", value=24.0, precision=2)
            generate_btn = gr.Button(
                "Generate HDR", variant="primary", size="lg", elem_classes=["generate-btn"],
            )
            with gr.Accordion("Advanced Settings", open=False):
                high_res = gr.Checkbox(label="High Resolution (2×)", value=False)
                high_quality_hdr = gr.Checkbox(
                    label="High-Quality HDR",
                    value=False,
                    info="Generates at 2× internal frame count, keeps every other frame. ~2× slower, smoother temporally.",
                )
                export_exr = gr.Checkbox(
                    label="Export EXR frames (zip)",
                    value=True,
                    info="Per-frame linear OpenEXR (float32). Uncheck if you only want the mp4 preview.",
                )
                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, value=10, step=1)
                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)

        # Right column: outputs.
        with gr.Column(scale=1):
            output_video = gr.Video(label="Preview (sRGB tonemap)", autoplay=True, height=480)
            gr.Markdown("The preview mp4 is a fixed-EV sRGB tonemap; the EXR zip contains the full linear float frames for grading.")
            output_exr = gr.File(label="EXR frames (zip)")

    # A new upload pre-fills duration, aspect ratio and FPS from its metadata.
    input_video.change(
        fn=on_video_upload,
        inputs=[input_video],
        outputs=[duration, target_aspect, frame_rate],
    )

    # The seed slider is also an output, so the seed actually used is shown.
    generate_btn.click(
        fn=generate_video,
        inputs=[
            input_video, prompt, duration, frame_rate, target_aspect, high_res,
            seed, randomize_seed, high_quality_hdr, export_exr,
        ],
        outputs=[output_video, output_exr, seed],
    )

if __name__ == "__main__":
    demo.launch()