# Spaces: Sleeping
"""LTX-2.3 image-to-video Gradio Space (ZeroGPU).

Upload an image + prompt -> short MP4 (with audio) generated by the LTX-2.3 22B
distilled model via diffusers. Matches the model stack of the
WhatDreamsCost "LTX Director 2" ComfyUI workflow, reimplemented natively.

See docs/superpowers/specs/2026-06-25-ltx-image-to-video-space-design.md
"""
import logging
import random
import tempfile

import gradio as gr
import spaces
import torch
from diffusers import LTX2ImageToVideoPipeline
from PIL import Image
# --- Generation constants (from the reference workflow + distilled recipe) ---

# Checkpoint handed to `from_pretrained`.
MODEL_ID = "diffusers/LTX-2.3-Distilled-Diffusers"

# Frame count must have the form 8k + 1; k = 15 gives ~5s at 24 fps.
NUM_FRAMES = 8 * 15 + 1
FRAME_RATE = 24.0

# Distilled recipe: few steps, CFG = 1.
NUM_STEPS = 8
GUIDANCE_SCALE = 1.0

# Base-stage long edge (each axis is rounded to a multiple of 32).
BASE_LONG_SIDE = 704

# ZeroGPU seconds budget per call.
GPU_DURATION = 120

# Largest seed value drawn for a generation.
MAX_SEED = (1 << 32) - 1
# Negative prompt: use the one shipped with the pipeline when this diffusers
# build exposes it, otherwise fall back to a local default (best-effort).
_FALLBACK_NEGATIVE_PROMPT = (
    "worst quality, inconsistent motion, blurry, jittery, distorted"
)
try:
    from diffusers.pipelines.ltx2.utils import DEFAULT_NEGATIVE_PROMPT
except Exception:  # pragma: no cover - depends on diffusers version
    DEFAULT_NEGATIVE_PROMPT = _FALLBACK_NEGATIVE_PROMPT
# The pipeline is built a single time, at import, and stays on CPU here.
# ZeroGPU attaches the GPU only inside the @spaces.GPU worker, so CUDA
# placement / offload is configured later, from within that worker.
pipe = LTX2ImageToVideoPipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
)

# Set to True by `generate` once CPU offload has been enabled on `pipe`.
_offload_ready = False
def _target_size(image: Image.Image, long_side: int = BASE_LONG_SIDE):
    """Compute the generation size for `image`.

    The longer edge is set to `long_side` and the shorter edge follows the
    image's aspect ratio; both are then rounded to the nearest multiple of 32
    and clamped to at least 256. A zero-height image is treated as square.

    Returns:
        ``(width, height)`` as ints.
    """
    src_w, src_h = image.size
    aspect = src_w / src_h if src_h else 1.0

    landscape = aspect >= 1.0
    raw_w = long_side if landscape else long_side * aspect
    raw_h = long_side / aspect if landscape else long_side

    def _snap(edge):
        # Nearest multiple of 32, never below 256.
        return int(max(256, round(edge / 32) * 32))

    return _snap(raw_w), _snap(raw_h)
def _normalize_output(result):
    """Return ``(frames, audio)`` regardless of the pipeline's return shape.

    The main LTX2Pipeline returns a (video, audio) tuple; the distilled card
    shows a `.frames[0]` object. Both are handled:

    * A 2-item tuple/list is unpacked as ``(video, audio)``. When ``video`` is
      a list/tuple whose first element is itself a list/tuple (a batch of
      frame-lists), only the first batch entry is kept.
    * Anything else is read as ``result.frames[0]`` plus an optional
      ``result.audio`` attribute (``None`` when the attribute is absent).

    Array/tensor videos in the tuple form are passed through unchanged.
    NOTE(review): a leading batch axis on such arrays, if present, is not
    stripped here - confirm what the exporter expects.
    """
    if isinstance(result, (tuple, list)) and len(result) == 2:
        video, audio = result
        # Probe ``video[0]`` only on plain sequences. Truth-testing a
        # multi-element NumPy array or torch tensor raises instead of
        # answering "is it empty?", which used to crash this branch.
        is_batched = (
            isinstance(video, (list, tuple))
            and len(video) > 0
            and isinstance(video[0], (list, tuple))
        )
        if is_batched:
            video = video[0]
        return video, audio

    frames = result.frames[0]
    audio = getattr(result, "audio", None)
    return frames, audio
def _save_video(frames, audio, path: str):
    """Export frames (+ audio if available) to an MP4 at `path`.

    When `audio` is not None the LTX-2 joint audio/video exporter is tried
    first. If that exporter is missing or fails for any reason, the failure is
    logged with its traceback and the function falls back to a video-only
    export, so a clip is still produced.

    Args:
        frames: Video frames as produced by the pipeline.
        audio: Audio track from the pipeline, or None for a silent clip.
        path: Destination file path of the MP4.
    """
    if audio is not None:
        try:
            from diffusers.pipelines.ltx2.export_utils import encode_video

            # Keyword arguments on purpose: the previous positional call
            # (frames, audio, FRAME_RATE, path) did not line up with the
            # exporter's parameters and the resulting error was silently
            # swallowed, so audio was dropped without any trace.
            # NOTE(review): keyword names and the vocoder sample-rate lookup
            # follow the diffusers LTX-2 documentation example - confirm
            # against the installed diffusers version.
            encode_video(
                frames,
                fps=FRAME_RATE,
                audio=audio,
                audio_sample_rate=pipe.vocoder.config.output_sampling_rate,
                output_path=path,
            )
            return
        except Exception:
            # Deliberately best-effort, but no longer silent.
            logging.getLogger(__name__).warning(
                "Audio/video export failed; falling back to video-only export.",
                exc_info=True,
            )

    from diffusers.utils import export_to_video

    export_to_video(frames, path, fps=int(FRAME_RATE))
def _maybe_upscale(frames):
    """Run the best-effort 2x spatial upscale stage on `frames`.

    The diffusers two-stage upscaler API for LTX-2.3 is not yet stable, so the
    stage is opt-in (off by default). Nothing is caught here: a missing
    pipeline class or any load/run failure propagates, and the caller is
    expected to keep the base-resolution frames and warn the user.

    Returns:
        ``.frames[0]`` of the upsampler's output.
    """
    # Imported lazily so a build without this class raises here, at call time.
    from diffusers import LTXLatentUpsamplePipeline as _UpsamplePipeline

    stage = _UpsamplePipeline.from_pretrained(
        "Lightricks/LTX-2.3",
        torch_dtype=torch.bfloat16,
        subfolder="latent_upsampler",
    )
    stage.to("cuda")

    upscaled = stage(frames)
    return upscaled.frames[0]
@spaces.GPU(duration=GPU_DURATION)
def generate(image, prompt, upscale, progress=gr.Progress(track_tqdm=True)):
    """Generate a short MP4 from an input image and a motion prompt.

    Runs inside the ZeroGPU worker (hence the ``@spaces.GPU`` decorator): the
    module-level pipeline is loaded on CPU and a GPU is only attached while a
    decorated function executes.

    Args:
        image: Input image; a PIL image, or an array accepted by
            ``Image.fromarray``.
        prompt: Text describing the desired motion; must be non-blank.
        upscale: When truthy, attempt the optional 2x upscale stage. A failure
            there is logged, reported to the user as a warning, and the
            base-resolution frames are kept.
        progress: Gradio progress tracker (mirrors tqdm bars in the UI).

    Returns:
        Path of the generated ``.mp4`` file.

    Raises:
        gr.Error: If the image or prompt is missing, or the GPU runs out of
            memory during generation.
    """
    global _offload_ready

    if image is None:
        raise gr.Error("Please upload an image first.")
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt describing the motion.")

    # CPU offload needs the GPU, so it is enabled lazily on the first call.
    if not _offload_ready:
        pipe.enable_model_cpu_offload()
        _offload_ready = True

    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    image = image.convert("RGB")
    width, height = _target_size(image)

    # Fresh random seed per request.
    seed = random.randint(0, MAX_SEED)
    generator = torch.Generator(device="cuda").manual_seed(seed)

    try:
        result = pipe(
            image=image,
            prompt=prompt.strip(),
            negative_prompt=DEFAULT_NEGATIVE_PROMPT,
            width=width,
            height=height,
            num_frames=NUM_FRAMES,
            frame_rate=FRAME_RATE,
            num_inference_steps=NUM_STEPS,
            guidance_scale=GUIDANCE_SCALE,
            generator=generator,
        )
    except torch.cuda.OutOfMemoryError as exc:  # pragma: no cover
        torch.cuda.empty_cache()
        raise gr.Error("Ran out of GPU memory. Try a smaller image.") from exc

    frames, audio = _normalize_output(result)

    if upscale:
        try:
            frames = _maybe_upscale(frames)
        except Exception:
            # Best-effort stage: keep the base frames, but leave a trace of
            # why the upscale did not happen.
            logging.getLogger(__name__).warning(
                "2x upscale stage failed; keeping base-resolution frames.",
                exc_info=True,
            )
            gr.Warning(
                "2x upscale stage is unavailable in this build — "
                "returning base-resolution video."
            )

    # Reserve a unique .mp4 path and close the handle right away so no file
    # descriptor leaks; delete=False keeps the file for Gradio to serve.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out_path = tmp.name
    _save_video(frames, audio, out_path)
    return out_path
# --- UI: inputs in the left column, resulting video in the right column ---
with gr.Blocks(title="LTX-2.3 Image to Video") as demo:
    gr.Markdown(
        value=(
            "# LTX-2.3 Image → Video\n"
            "Upload an image and describe the motion. Generates ~5s of video "
            "(with audio) using the LTX-2.3 22B distilled model."
        )
    )

    with gr.Row():
        with gr.Column():
            image_in = gr.Image(type="pil", label="Input image")
            prompt_in = gr.Textbox(
                lines=3,
                label="Prompt",
                placeholder="A man plays a red electric guitar, camera slowly zooms in.",
            )
            # Off by default: the upscale stage is optional.
            upscale_in = gr.Checkbox(
                value=False,
                label="2× high-res upscale (slower, may exceed GPU time limit)",
            )
            run_btn = gr.Button(value="Generate", variant="primary")

        with gr.Column():
            video_out = gr.Video(label="Result")

    # Wire the button to the generation entry point.
    run_btn.click(
        generate,
        inputs=[image_in, prompt_in, upscale_in],
        outputs=video_out,
    )


if __name__ == "__main__":
    demo.launch()