"""LTX-2.3 image-to-video Gradio Space (ZeroGPU).
Upload an image + prompt -> short MP4 (with audio) generated by the LTX-2.3 22B
distilled model via diffusers. Matches the model stack of the
WhatDreamsCost "LTX Director 2" ComfyUI workflow, reimplemented natively.
See docs/superpowers/specs/2026-06-25-ltx-image-to-video-space-design.md
"""
import random
import tempfile
import gradio as gr
import spaces
import torch
from PIL import Image
from diffusers import LTX2ImageToVideoPipeline
# --- Generation constants (from the reference workflow + distilled recipe) ---
# Hub repository id handed to `from_pretrained` when the pipeline is loaded.
MODEL_ID = "diffusers/LTX-2.3-Distilled-Diffusers"
# Frames per clip, passed as `num_frames`; must be 8k + 1 (121 = 8 * 15 + 1),
# which is ~5s at 24 fps.
NUM_FRAMES = 121
# Passed to the pipeline as `frame_rate` and reused as the fps of the exported MP4.
FRAME_RATE = 24.0
# Passed as `num_inference_steps` (distilled recipe).
NUM_STEPS = 8
# Passed as `guidance_scale`; CFG = 1 for the distilled model.
GUIDANCE_SCALE = 1.0
# Base-stage long edge; default `long_side` of `_target_size`, which rounds
# each axis to a multiple of 32.
BASE_LONG_SIDE = 704
# ZeroGPU seconds budget per call, given to `@spaces.GPU(duration=...)`.
GPU_DURATION = 120
# Inclusive upper bound for the per-request seed drawn with `random.randint`.
MAX_SEED = 2**32 - 1
# Prefer the negative prompt bundled with the installed diffusers build. Any
# failure while importing it (the module layout depends on the diffusers
# version) falls back to a short built-in prompt, so start-up never breaks here.
try:
    from diffusers.pipelines.ltx2.utils import DEFAULT_NEGATIVE_PROMPT
except Exception:  # pragma: no cover - depends on diffusers version
    DEFAULT_NEGATIVE_PROMPT = "worst quality, inconsistent motion, blurry, jittery, distorted"
# Load the pipeline once at import, on CPU. ZeroGPU attaches the GPU only inside
# the @spaces.GPU worker, so CUDA placement / offload is set up there.
pipe = LTX2ImageToVideoPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
# Flipped to True by `generate` right after it calls
# `pipe.enable_model_cpu_offload()`, so later requests skip that call.
_offload_ready = False
def _target_size(image: Image.Image, long_side: int = BASE_LONG_SIDE):
    """Compute the generation resolution for `image`.

    The longer edge is set to `long_side` and the shorter edge follows the
    source aspect ratio. Each axis is then rounded to a multiple of 32 (using
    the built-in `round`) and floored at 256.

    Args:
        image: Source image; only its `.size` (width, height) is read.
        long_side: Target length of the longer edge before rounding.

    Returns:
        A `(width, height)` tuple of ints.
    """
    src_w, src_h = image.size
    # A zero height would divide by zero; treat that case as square.
    aspect = src_w / src_h if src_h else 1.0

    def _snap(extent):
        # Nearest multiple of 32, never below 256.
        return int(max(256, round(extent / 32) * 32))

    if aspect < 1.0:
        # Portrait: height is the long edge.
        return _snap(long_side * aspect), _snap(long_side)
    # Landscape or square: width is the long edge.
    return _snap(long_side), _snap(long_side / aspect)
def _normalize_output(result):
"""Return (frames, audio) regardless of pipeline return shape.
The main LTX2Pipeline returns a (video, audio) tuple; the distilled card
shows a `.frames[0]` object. Handle both.
"""
if isinstance(result, (tuple, list)) and len(result) == 2:
video, audio = result
# `video` may itself be a batch list of frame-lists.
if video and isinstance(video[0], (list, tuple)):
video = video[0]
return video, audio
frames = result.frames[0]
audio = getattr(result, "audio", None)
return frames, audio
def _save_video(frames, audio, path: str, audio_sample_rate=None):
    """Export frames (+ audio if available) to an MP4 at `path`.

    When `audio` is given, the LTX-2 joint audio/video exporter is tried
    first. If importing or calling it fails for any reason, the failure is
    logged and a video-only MP4 is written instead, so a clip is still
    produced.

    Args:
        frames: Video frames, as returned by `_normalize_output`.
        audio: Audio track to mux in, or None for a silent clip.
        path: Destination file path.
        audio_sample_rate: Sample rate of `audio`. When None, it is read from
            `pipe.vocoder.config.output_sampling_rate` if that attribute
            chain exists.
    """
    import logging

    if audio is not None:
        try:
            # Preferred: LTX-2.3 joint A/V exporter.
            from diffusers.pipelines.ltx2.export_utils import encode_video

            if audio_sample_rate is None:
                vocoder_cfg = getattr(getattr(pipe, "vocoder", None), "config", None)
                audio_sample_rate = getattr(vocoder_cfg, "output_sampling_rate", None)

            # Call by keyword: the positional order (frames, audio, fps, path)
            # does not line up with the exporter's documented parameters
            # (video, fps, audio, audio_sample_rate, output_path), so that
            # call failed and the audio was dropped without any trace.
            # NOTE(review): confirm the keyword names against the installed
            # diffusers version.
            # NOTE(review): the exporter may expect a single-sample CPU float
            # tensor (e.g. `audio[0].float().cpu()`) -- confirm what
            # `_normalize_output` hands over here.
            encode_video(
                frames,
                fps=FRAME_RATE,
                audio=audio,
                audio_sample_rate=audio_sample_rate,
                output_path=path,
            )
            return
        except Exception:
            # Best-effort: keep going with a silent clip, but leave a trace
            # instead of swallowing the failure.
            logging.getLogger(__name__).warning(
                "Audio/video export failed; writing video-only MP4.",
                exc_info=True,
            )

    from diffusers.utils import export_to_video

    export_to_video(frames, path, fps=int(FRAME_RATE))
def _maybe_upscale(frames):
    """Run the optional 2x spatial upscale stage on `frames`.

    This stage is opt-in because the diffusers two-stage upscaler API for
    LTX-2.3 is not yet stable. Nothing is caught here: if the upsampler class
    cannot be imported, or loading/running it fails, the exception propagates
    and the caller is expected to keep the base-resolution frames and warn
    the user.

    Returns:
        The first entry of the upsampler output's `.frames`.
    """
    # Imported lazily so a diffusers build without this class only fails when
    # upscaling is actually requested.
    from diffusers import LTXLatentUpsamplePipeline as _UpsamplePipeline

    stage = _UpsamplePipeline.from_pretrained(
        "Lightricks/LTX-2.3",
        subfolder="latent_upsampler",
        torch_dtype=torch.bfloat16,
    )
    stage.to("cuda")
    upscaled = stage(frames)
    return upscaled.frames[0]
@spaces.GPU(duration=GPU_DURATION)
def generate(image, prompt, upscale, progress=gr.Progress(track_tqdm=True)):
    """Turn one image plus a motion prompt into an MP4 and return its path.

    Args:
        image: Input picture; anything that is not a PIL image is converted
            with `Image.fromarray`.
        prompt: Text describing the desired motion; surrounding whitespace is
            stripped before it reaches the pipeline.
        upscale: When truthy, attempt the optional 2x upscale stage and fall
            back to base resolution (with a UI warning) if it fails.
        progress: Gradio progress tracker. Not referenced in the body; it is
            declared with `track_tqdm=True` so the UI can mirror tqdm bars.

    Returns:
        Path of the generated `.mp4` temporary file.

    Raises:
        gr.Error: If no image or an empty prompt is supplied, or if the
            pipeline runs out of GPU memory.
    """
    global _offload_ready

    if image is None:
        raise gr.Error("Please upload an image first.")
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt describing the motion.")

    # Offload is configured lazily, inside the GPU worker, and only once.
    if not _offload_ready:
        pipe.enable_model_cpu_offload()
        _offload_ready = True

    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    image = image.convert("RGB")
    width, height = _target_size(image)

    # A fresh random seed per request.
    seed = random.randint(0, MAX_SEED)
    generator = torch.Generator(device="cuda").manual_seed(seed)

    try:
        result = pipe(
            image=image,
            prompt=prompt.strip(),
            negative_prompt=DEFAULT_NEGATIVE_PROMPT,
            width=width,
            height=height,
            num_frames=NUM_FRAMES,
            frame_rate=FRAME_RATE,
            num_inference_steps=NUM_STEPS,
            guidance_scale=GUIDANCE_SCALE,
            generator=generator,
        )
    except torch.cuda.OutOfMemoryError as exc:  # pragma: no cover
        torch.cuda.empty_cache()
        raise gr.Error("Ran out of GPU memory. Try a smaller image.") from exc

    frames, audio = _normalize_output(result)

    if upscale:
        try:
            frames = _maybe_upscale(frames)
        except Exception:
            # Best-effort stage: keep the base-resolution frames.
            gr.Warning(
                "2x upscale stage is unavailable in this build — "
                "returning base-resolution video."
            )

    # Reserve a unique .mp4 path and close the handle straight away, so no
    # file descriptor is left open while the exporter writes to the path.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out_path = tmp.name
    _save_video(frames, audio, out_path)
    return out_path
# UI definition: inputs on the left, the rendered clip on the right.
with gr.Blocks(title="LTX-2.3 Image to Video") as demo:
    gr.Markdown(
        value=(
            "# LTX-2.3 Image → Video\n"
            "Upload an image and describe the motion. Generates ~5s of video "
            "(with audio) using the LTX-2.3 22B distilled model."
        )
    )

    with gr.Row():
        # Input column: source image, prompt, upscale toggle and trigger.
        with gr.Column():
            image_in = gr.Image(type="pil", label="Input image")
            prompt_in = gr.Textbox(
                lines=3,
                label="Prompt",
                placeholder="A man plays a red electric guitar, camera slowly zooms in.",
            )
            upscale_in = gr.Checkbox(
                value=False,
                label="2× high-res upscale (slower, may exceed GPU time limit)",
            )
            run_btn = gr.Button(value="Generate", variant="primary")

        # Output column: the generated video.
        with gr.Column():
            video_out = gr.Video(label="Result")

    # Event wiring has to happen while the Blocks context is still open.
    run_btn.click(
        fn=generate,
        inputs=[image_in, prompt_in, upscale_in],
        outputs=video_out,
    )


# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|