"""LTX-2.3 image-to-video Gradio Space (ZeroGPU). Upload an image + prompt -> short MP4 (with audio) generated by the LTX-2.3 22B distilled model via diffusers. Matches the model stack of the WhatDreamsCost "LTX Director 2" ComfyUI workflow, reimplemented natively. See docs/superpowers/specs/2026-06-25-ltx-image-to-video-space-design.md """ import random import tempfile import gradio as gr import spaces import torch from PIL import Image from diffusers import LTX2ImageToVideoPipeline # --- Generation constants (from the reference workflow + distilled recipe) --- MODEL_ID = "diffusers/LTX-2.3-Distilled-Diffusers" NUM_FRAMES = 121 # must be 8k + 1; ~5s at 24 fps FRAME_RATE = 24.0 NUM_STEPS = 8 # distilled GUIDANCE_SCALE = 1.0 # CFG = 1 for the distilled model BASE_LONG_SIDE = 704 # base-stage long edge (rounded to /32 per axis) GPU_DURATION = 120 # ZeroGPU seconds budget per call MAX_SEED = 2**32 - 1 # Optional default negative prompt shipped with the pipeline (best-effort). try: from diffusers.pipelines.ltx2.utils import DEFAULT_NEGATIVE_PROMPT except Exception: # pragma: no cover - depends on diffusers version DEFAULT_NEGATIVE_PROMPT = ( "worst quality, inconsistent motion, blurry, jittery, distorted" ) # Load the pipeline once at import, on CPU. ZeroGPU attaches the GPU only inside # the @spaces.GPU worker, so CUDA placement / offload is set up there. pipe = LTX2ImageToVideoPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16) _offload_ready = False def _target_size(image: Image.Image, long_side: int = BASE_LONG_SIDE): """Fit the image aspect ratio into `long_side`, each axis a multiple of 32.""" w, h = image.size ar = w / h if h else 1.0 if ar >= 1.0: tw, th = long_side, long_side / ar else: tw, th = long_side * ar, long_side tw = max(256, round(tw / 32) * 32) th = max(256, round(th / 32) * 32) return int(tw), int(th) def _normalize_output(result): """Return (frames, audio) regardless of pipeline return shape. The main LTX2Pipeline returns a (video, audio) tuple; the distilled card shows a `.frames[0]` object. Handle both. """ if isinstance(result, (tuple, list)) and len(result) == 2: video, audio = result # `video` may itself be a batch list of frame-lists. if video and isinstance(video[0], (list, tuple)): video = video[0] return video, audio frames = result.frames[0] audio = getattr(result, "audio", None) return frames, audio def _save_video(frames, audio, path: str): """Export frames (+ audio if available) to an MP4 at `path`.""" # Preferred: LTX-2.3 joint A/V exporter. if audio is not None: try: from diffusers.pipelines.ltx2.export_utils import encode_video encode_video(frames, audio, FRAME_RATE, path) return except Exception: pass # fall through to video-only export from diffusers.utils import export_to_video export_to_video(frames, path, fps=int(FRAME_RATE)) def _maybe_upscale(frames): """Best-effort 2x spatial upscale stage. The diffusers two-stage upscaler API for LTX-2.3 is not yet stable, so this is opt-in (default off) and degrades gracefully: if unavailable, the caller keeps the base-resolution frames and warns the user. """ from diffusers import LTXLatentUpsamplePipeline # raises if unavailable upsampler = LTXLatentUpsamplePipeline.from_pretrained( "Lightricks/LTX-2.3", subfolder="latent_upsampler", torch_dtype=torch.bfloat16 ) upsampler.to("cuda") return upsampler(frames).frames[0] @spaces.GPU(duration=GPU_DURATION) def generate(image, prompt, upscale, progress=gr.Progress(track_tqdm=True)): global _offload_ready if image is None: raise gr.Error("Please upload an image first.") if not prompt or not prompt.strip(): raise gr.Error("Please enter a prompt describing the motion.") if not _offload_ready: pipe.enable_model_cpu_offload() _offload_ready = True if not isinstance(image, Image.Image): image = Image.fromarray(image) image = image.convert("RGB") width, height = _target_size(image) seed = random.randint(0, MAX_SEED) generator = torch.Generator(device="cuda").manual_seed(seed) try: result = pipe( image=image, prompt=prompt.strip(), negative_prompt=DEFAULT_NEGATIVE_PROMPT, width=width, height=height, num_frames=NUM_FRAMES, frame_rate=FRAME_RATE, num_inference_steps=NUM_STEPS, guidance_scale=GUIDANCE_SCALE, generator=generator, ) except torch.cuda.OutOfMemoryError as exc: # pragma: no cover torch.cuda.empty_cache() raise gr.Error("Ran out of GPU memory. Try a smaller image.") from exc frames, audio = _normalize_output(result) if upscale: try: frames = _maybe_upscale(frames) except Exception: gr.Warning( "2x upscale stage is unavailable in this build — " "returning base-resolution video." ) out_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name _save_video(frames, audio, out_path) return out_path with gr.Blocks(title="LTX-2.3 Image to Video") as demo: gr.Markdown( "# LTX-2.3 Image → Video\n" "Upload an image and describe the motion. Generates ~5s of video " "(with audio) using the LTX-2.3 22B distilled model." ) with gr.Row(): with gr.Column(): image_in = gr.Image(label="Input image", type="pil") prompt_in = gr.Textbox( label="Prompt", placeholder="A man plays a red electric guitar, camera slowly zooms in.", lines=3, ) upscale_in = gr.Checkbox( label="2× high-res upscale (slower, may exceed GPU time limit)", value=False, ) run_btn = gr.Button("Generate", variant="primary") with gr.Column(): video_out = gr.Video(label="Result") run_btn.click( fn=generate, inputs=[image_in, prompt_in, upscale_in], outputs=video_out, ) if __name__ == "__main__": demo.launch()