File size: 6,450 Bytes
e4e75f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6c0f0c
e4e75f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6c0f0c
e4e75f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
"""LTX-2.3 image-to-video Gradio Space (ZeroGPU).

Upload an image + prompt -> short MP4 (with audio) generated by the LTX-2.3 22B
distilled model via diffusers. Matches the model stack of the
WhatDreamsCost "LTX Director 2" ComfyUI workflow, reimplemented natively.

See docs/superpowers/specs/2026-06-25-ltx-image-to-video-space-design.md
"""

import logging
import random
import tempfile

import gradio as gr
import spaces
import torch
from PIL import Image

from diffusers import LTX2ImageToVideoPipeline

# --- Generation constants (from the reference workflow + distilled recipe) ---
# All values are fixed per deployment: generate() forwards them to the pipeline
# unchanged and the UI below exposes none of them as controls.
MODEL_ID = "diffusers/LTX-2.3-Distilled-Diffusers"
NUM_FRAMES = 121          # must be 8k + 1; ~5s at 24 fps
FRAME_RATE = 24.0         # passed to the pipeline as-is; cast to int for video-only export
NUM_STEPS = 8             # distilled
GUIDANCE_SCALE = 1.0      # CFG = 1 for the distilled model
BASE_LONG_SIDE = 704      # base-stage long edge (rounded to /32 per axis)
GPU_DURATION = 120        # ZeroGPU seconds budget per call
MAX_SEED = 2**32 - 1      # inclusive upper bound for the per-call random seed

# Optional default negative prompt shipped with the pipeline (best-effort).
# Any failure while importing it (not only ImportError) selects the hard-coded
# fallback string, so the module still loads on builds without that helper.
try:
    from diffusers.pipelines.ltx2.utils import DEFAULT_NEGATIVE_PROMPT
except Exception:  # pragma: no cover - depends on diffusers version
    DEFAULT_NEGATIVE_PROMPT = (
        "worst quality, inconsistent motion, blurry, jittery, distorted"
    )

# Load the pipeline once at import, on CPU. ZeroGPU attaches the GPU only inside
# the @spaces.GPU worker, so CUDA placement / offload is set up there.
pipe = LTX2ImageToVideoPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
# Flipped to True by generate() after it has called enable_model_cpu_offload(),
# so that call is not repeated while the flag stays set.
_offload_ready = False


def _target_size(image: Image.Image, long_side: int = BASE_LONG_SIDE):
    """Compute the generation resolution for `image`.

    The longer edge is set to `long_side` and the shorter edge is scaled by
    the image's aspect ratio. Each edge is then rounded to a multiple of 32
    and raised to at least 256.

    Args:
        image: Source image; only its `size` (width, height) is read.
        long_side: Target length of the longer edge before rounding.

    Returns:
        A `(width, height)` tuple of ints.
    """

    def _snap(edge):
        # Round to a multiple of 32, never going below 256.
        return int(max(256, round(edge / 32) * 32))

    src_w, src_h = image.size
    # A zero-height image is treated as square to avoid dividing by zero.
    aspect = 1.0 if not src_h else src_w / src_h
    if aspect < 1.0:
        # Portrait: the height is the long edge.
        return _snap(long_side * aspect), _snap(long_side)
    # Landscape or square: the width is the long edge.
    return _snap(long_side), _snap(long_side / aspect)


def _normalize_output(result):
    """Return `(frames, audio)` regardless of the pipeline's return shape.

    Two shapes are handled:

    * A 2-item tuple/list `(video, audio)`. When `video` is a non-empty
      list/tuple whose first item is itself a list/tuple (a batch of
      frame-lists), only the first batch entry is kept; otherwise `video`
      is returned unchanged.
    * Any other object. Frames are read from `result.frames[0]` and audio
      from `result.audio` when that attribute exists (otherwise `None`).

    Args:
        result: Whatever the pipeline call returned.

    Returns:
        A `(frames, audio)` tuple; `audio` may be `None`.
    """
    if isinstance(result, (tuple, list)) and len(result) == 2:
        video, audio = result
        # Only probe `video[0]` for plain Python sequences. Array-like
        # containers (numpy arrays, tensors) raise on truth-testing when they
        # hold more than one element, so they are passed through untouched.
        is_batched = (
            isinstance(video, (list, tuple))
            and len(video) > 0
            and isinstance(video[0], (list, tuple))
        )
        return (video[0] if is_batched else video), audio
    return result.frames[0], getattr(result, "audio", None)


def _save_video(frames, audio, path: str):
    """Export frames (+ audio if available) to an MP4 at `path`.

    When `audio` is not `None`, the joint audio/video exporter is tried first.
    If it cannot be imported or raises, the failure is logged (with traceback)
    and the function falls back to a video-only export, so a file is still
    written.

    Args:
        frames: Video frames, as returned by `_normalize_output`.
        audio: Audio track, or `None` to write a video-only file.
        path: Destination file path for the MP4.
    """
    # Preferred: LTX-2.3 joint A/V exporter.
    if audio is not None:
        try:
            from diffusers.pipelines.ltx2.export_utils import encode_video

            # NOTE(review): argument order kept as originally written; confirm
            # it matches the `encode_video` signature of the installed
            # diffusers build.
            encode_video(frames, audio, FRAME_RATE, path)
        except Exception:
            # Best-effort: keep going, but record why audio was dropped
            # instead of hiding the error.
            logging.getLogger(__name__).warning(
                "Joint audio/video export failed; falling back to video-only.",
                exc_info=True,
            )
        else:
            return
    from diffusers.utils import export_to_video

    export_to_video(frames, path, fps=int(FRAME_RATE))


def _maybe_upscale(frames):
    """Run the optional, best-effort 2x spatial upscale stage on `frames`.

    The diffusers two-stage upscaler API for LTX-2.3 is not yet stable, so the
    feature is opt-in (off by default). This function does not catch anything
    itself: if the upsampler pipeline is missing or fails, the exception
    propagates and the caller is expected to keep the base-resolution frames
    and warn the user.

    Note that the upsampler is loaded via `from_pretrained` and moved to CUDA
    on every call.

    Args:
        frames: Frames produced by the base generation stage.

    Returns:
        The first entry of the upsampler output's `frames`.
    """
    # Imported lazily so a diffusers build without this pipeline raises here,
    # inside the caller's try/except, rather than at module import time.
    from diffusers import LTXLatentUpsamplePipeline

    latent_upsampler = LTXLatentUpsamplePipeline.from_pretrained(
        "Lightricks/LTX-2.3",
        subfolder="latent_upsampler",
        torch_dtype=torch.bfloat16,
    )
    latent_upsampler.to("cuda")
    upscaled = latent_upsampler(frames)
    return upscaled.frames[0]


@spaces.GPU(duration=GPU_DURATION)
def generate(image, prompt, upscale, progress=gr.Progress(track_tqdm=True)):
    """Generate an MP4 from an input image and a motion prompt.

    Validates the inputs, enables model CPU offload on first use, resizes the
    request to the resolution chosen by `_target_size`, runs the pipeline with
    a freshly drawn random seed, optionally attempts the 2x upscale stage, and
    writes the result to a temporary `.mp4` file.

    Args:
        image: Input image; anything that is not a PIL image is converted with
            `Image.fromarray`.
        prompt: Text describing the motion; surrounding whitespace is stripped.
        upscale: When truthy, try `_maybe_upscale`; on failure the
            base-resolution frames are kept and a warning is shown.
        progress: Gradio progress tracker. Not referenced in the body; it is
            declared with `track_tqdm=True` so Gradio can surface tqdm progress.

    Returns:
        Path of the generated MP4. The file is created with `delete=False`,
        so it is not removed by this function.

    Raises:
        gr.Error: If no image was given, the prompt is empty/blank, or the
            pipeline ran out of GPU memory.
    """
    global _offload_ready
    if image is None:
        raise gr.Error("Please upload an image first.")
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt describing the motion.")

    # Offload hooks are installed here rather than at import because the GPU
    # is only attached inside this decorated function.
    if not _offload_ready:
        pipe.enable_model_cpu_offload()
        _offload_ready = True

    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    image = image.convert("RGB")
    width, height = _target_size(image)

    # A new seed per call; it is not reported back to the user.
    seed = random.randint(0, MAX_SEED)
    generator = torch.Generator(device="cuda").manual_seed(seed)

    try:
        result = pipe(
            image=image,
            prompt=prompt.strip(),
            negative_prompt=DEFAULT_NEGATIVE_PROMPT,
            width=width,
            height=height,
            num_frames=NUM_FRAMES,
            frame_rate=FRAME_RATE,
            num_inference_steps=NUM_STEPS,
            guidance_scale=GUIDANCE_SCALE,
            generator=generator,
        )
    except torch.cuda.OutOfMemoryError as exc:  # pragma: no cover
        torch.cuda.empty_cache()
        raise gr.Error("Ran out of GPU memory. Try a smaller image.") from exc

    frames, audio = _normalize_output(result)

    if upscale:
        try:
            frames = _maybe_upscale(frames)
        except Exception:
            # Still best-effort, but keep the underlying cause in the logs
            # instead of discarding it.
            logging.getLogger(__name__).exception("2x upscale stage failed")
            gr.Warning(
                "2x upscale stage is unavailable in this build — "
                "returning base-resolution video."
            )

    # Reserve a unique path and close the handle right away: the exporter
    # reopens the path itself, and leaving the handle open would leak it.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out_path = tmp.name
    _save_video(frames, audio, out_path)
    return out_path


# UI definition. Components are created inside nested context managers, so the
# order of the statements below determines the layout: one row holding an
# inputs column followed by an output column.
with gr.Blocks(title="LTX-2.3 Image to Video") as demo:
    gr.Markdown(
        "# LTX-2.3 Image → Video\n"
        "Upload an image and describe the motion. Generates ~5s of video "
        "(with audio) using the LTX-2.3 22B distilled model."
    )
    with gr.Row():
        # Inputs column: image, prompt, upscale toggle, and the run button.
        with gr.Column():
            # type="pil" asks Gradio to hand generate() a PIL image.
            image_in = gr.Image(label="Input image", type="pil")
            prompt_in = gr.Textbox(
                label="Prompt",
                placeholder="A man plays a red electric guitar, camera slowly zooms in.",
                lines=3,
            )
            # Off by default; generate() only attempts the upscale stage when checked.
            upscale_in = gr.Checkbox(
                label="2× high-res upscale (slower, may exceed GPU time limit)",
                value=False,
            )
            run_btn = gr.Button("Generate", variant="primary")
        # Output column: the generated video.
        with gr.Column():
            video_out = gr.Video(label="Result")

    # The inputs list order must match generate(image, prompt, upscale); the
    # returned file path is shown in the video component.
    run_btn.click(
        fn=generate,
        inputs=[image_in, prompt_in, upscale_in],
        outputs=video_out,
    )


# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()