Spaces:

DevXCoder2025
/

ltx-m-v2

Sleeping

File size: 6,450 Bytes

"""LTX-2.3 image-to-video Gradio Space (ZeroGPU).

Upload an image + prompt -> short MP4 (with audio) generated by the LTX-2.3 22B
distilled model via diffusers. Matches the model stack of the
WhatDreamsCost "LTX Director 2" ComfyUI workflow, reimplemented natively.

See docs/superpowers/specs/2026-06-25-ltx-image-to-video-space-design.md
"""

import logging
import random
import tempfile

import gradio as gr
import spaces
import torch
from PIL import Image

from diffusers import LTX2ImageToVideoPipeline

# --- Generation constants (from the reference workflow + distilled recipe) ---
# Repository id handed to LTX2ImageToVideoPipeline.from_pretrained() below.
MODEL_ID = "diffusers/LTX-2.3-Distilled-Diffusers"
NUM_FRAMES = 121          # must be 8k + 1; ~5s at 24 fps
FRAME_RATE = 24.0         # passed to the pipeline and used as the export fps
NUM_STEPS = 8             # distilled
GUIDANCE_SCALE = 1.0      # CFG = 1 for the distilled model
BASE_LONG_SIDE = 704      # base-stage long edge (rounded to /32 per axis)
GPU_DURATION = 120        # ZeroGPU seconds budget per call (@spaces.GPU duration)
MAX_SEED = 2**32 - 1      # inclusive upper bound for the random seed in generate()

# Optional default negative prompt shipped with the pipeline (best-effort).
# The catch is deliberately broad: whatever goes wrong during this import, the
# hard-coded fallback string below is used instead of failing at startup.
try:
    from diffusers.pipelines.ltx2.utils import DEFAULT_NEGATIVE_PROMPT
except Exception:  # pragma: no cover - depends on diffusers version
    DEFAULT_NEGATIVE_PROMPT = (
        "worst quality, inconsistent motion, blurry, jittery, distorted"
    )

# Load the pipeline once at import, on CPU. ZeroGPU attaches the GPU only inside
# the @spaces.GPU worker, so CUDA placement / offload is set up there.
pipe = LTX2ImageToVideoPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
# Flipped to True by generate() right after it calls
# pipe.enable_model_cpu_offload(), so that call is not repeated on later runs.
_offload_ready = False


def _target_size(image: Image.Image, long_side: int = BASE_LONG_SIDE):
    """Pick a (width, height) for `image` whose longer edge is `long_side`.

    The source aspect ratio is followed as closely as the constraints allow:
    each axis is snapped to the nearest multiple of 32 and is never smaller
    than 256. An image reporting zero height is treated as square.

    Args:
        image: Object exposing a PIL-style ``size`` of ``(width, height)``.
        long_side: Target length of the longer edge before snapping.

    Returns:
        A ``(width, height)`` tuple of ints.
    """

    def _snap(edge):
        # Nearest multiple of 32, floored at 256 pixels.
        return int(max(256, round(edge / 32) * 32))

    src_w, src_h = image.size
    aspect = src_w / src_h if src_h else 1.0

    # Landscape/square images get `long_side` as their width; portrait images
    # get it as their height. The other edge follows from the aspect ratio.
    landscape = aspect >= 1.0
    raw_w = long_side if landscape else long_side * aspect
    raw_h = long_side / aspect if landscape else long_side
    return _snap(raw_w), _snap(raw_h)


def _normalize_output(result):
    """Return ``(frames, audio)`` regardless of pipeline return shape.

    Two result shapes are handled:

    * a 2-item tuple/list ``(video, audio)``. If ``video`` is a non-empty
      list/tuple whose first item is itself a list/tuple, it is treated as a
      batch and its first entry is returned. Any other ``video`` value
      (array, tensor, ``None``, flat frame list) is passed through untouched;
    * any other object, which must expose ``.frames`` (indexed with ``[0]``)
      and may expose ``.audio`` (``None`` when absent).

    Args:
        result: Whatever the pipeline call returned.

    Returns:
        A ``(frames, audio)`` tuple; ``audio`` is returned as received.

    Raises:
        AttributeError: If ``result`` is not a 2-sequence and has no ``frames``.
    """
    if isinstance(result, (tuple, list)) and len(result) == 2:
        video, audio = result
        # Check the container type BEFORE evaluating truthiness: `if video`
        # raises for multi-element numpy arrays / torch tensors, which have
        # no unambiguous boolean value.
        is_batched = (
            isinstance(video, (list, tuple))
            and len(video) > 0
            and isinstance(video[0], (list, tuple))
        )
        if is_batched:
            video = video[0]
        return video, audio

    frames = result.frames[0]
    audio = getattr(result, "audio", None)
    return frames, audio


def _save_video(frames, audio, path: str):
    """Export frames (+ audio if available) to an MP4 at `path`.

    When `audio` is not ``None`` the joint audio/video exporter from
    ``diffusers.pipelines.ltx2.export_utils`` is tried first. If importing or
    calling it fails for any reason, the failure is logged (with traceback)
    and the function falls back to a video-only export, so the caller still
    gets a playable file.

    Args:
        frames: Video frames as returned by `_normalize_output`.
        audio: Audio payload from the pipeline, or ``None`` for silent output.
        path: Destination file path for the MP4.
    """
    # Preferred: LTX-2.3 joint A/V exporter.
    if audio is not None:
        try:
            from diffusers.pipelines.ltx2.export_utils import encode_video

            # NOTE(review): positional argument order is not verified against
            # the installed diffusers `encode_video` signature — confirm. A
            # mismatch ends up in the handler below and yields a silent video.
            encode_video(frames, audio, FRAME_RATE, path)
        except Exception:
            # Best-effort by design, but never silent: without this log a
            # broken exporter just looks like "the model produced no audio".
            logging.getLogger(__name__).warning(
                "Joint audio/video export failed; "
                "falling back to video-only export.",
                exc_info=True,
            )
        else:
            return
    from diffusers.utils import export_to_video

    export_to_video(frames, path, fps=int(FRAME_RATE))


def _maybe_upscale(frames):
    """Run the optional, best-effort spatial upscale stage on `frames`.

    The diffusers upscaler API for LTX-2.3 is not yet stable, so this stage is
    opt-in and makes no attempt to recover from errors itself: any exception
    (missing pipeline class, failed download, runtime error) propagates to the
    caller, which is expected to keep the base-resolution frames.

    Args:
        frames: Frames produced by the base generation stage.

    Returns:
        The first entry of the upsampler result's ``frames``.
    """
    # Imported lazily so that a diffusers build lacking this pipeline raises
    # here rather than when the module is first loaded.
    from diffusers import LTXLatentUpsamplePipeline as _UpsamplePipeline

    upscale_pipe = _UpsamplePipeline.from_pretrained(
        "Lightricks/LTX-2.3",
        subfolder="latent_upsampler",
        torch_dtype=torch.bfloat16,
    )
    upscale_pipe.to("cuda")
    upscaled = upscale_pipe(frames)
    return upscaled.frames[0]


@spaces.GPU(duration=GPU_DURATION)
def generate(image, prompt, upscale, progress=gr.Progress(track_tqdm=True)):
    """Generate a video from `image` and `prompt` and return the MP4 path.

    Steps: validate inputs, enable CPU offload on first use, derive the
    target resolution from the image, run the pipeline with a fresh random
    seed, optionally attempt the upscale stage, then export to a temp file.

    Args:
        image: Input picture; a PIL image, or an array accepted by
            ``Image.fromarray``.
        prompt: Text describing the desired motion; must be non-blank.
        upscale: When truthy, try the optional upscale stage. If that stage
            fails, the failure is logged, a UI warning is shown and the
            base-resolution frames are kept.
        progress: Gradio progress tracker (mirrors tqdm bars in the UI).

    Returns:
        Filesystem path of the generated ``.mp4``. The file is created with
        ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If the image or prompt is missing, or the GPU runs out of
            memory during generation.
    """
    global _offload_ready
    if image is None:
        raise gr.Error("Please upload an image first.")
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt describing the motion.")

    # Offload hooks are installed lazily, on the first call, because the
    # module-level pipeline is created on CPU at import time.
    if not _offload_ready:
        pipe.enable_model_cpu_offload()
        _offload_ready = True

    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    image = image.convert("RGB")
    width, height = _target_size(image)

    seed = random.randint(0, MAX_SEED)
    generator = torch.Generator(device="cuda").manual_seed(seed)

    try:
        result = pipe(
            image=image,
            prompt=prompt.strip(),
            negative_prompt=DEFAULT_NEGATIVE_PROMPT,
            width=width,
            height=height,
            num_frames=NUM_FRAMES,
            frame_rate=FRAME_RATE,
            num_inference_steps=NUM_STEPS,
            guidance_scale=GUIDANCE_SCALE,
            generator=generator,
        )
    except torch.cuda.OutOfMemoryError as exc:  # pragma: no cover
        torch.cuda.empty_cache()
        raise gr.Error("Ran out of GPU memory. Try a smaller image.") from exc

    frames, audio = _normalize_output(result)

    if upscale:
        try:
            frames = _maybe_upscale(frames)
        except Exception:
            # Still best-effort, but keep the traceback in the logs so a
            # genuinely broken upscaler can be told apart from a missing one.
            logging.getLogger(__name__).warning(
                "Upscale stage failed; keeping base-resolution frames.",
                exc_info=True,
            )
            gr.Warning(
                "2x upscale stage is unavailable in this build — "
                "returning base-resolution video."
            )

    # Reserve a unique path, then close the handle before the exporter writes
    # to it; the bare `NamedTemporaryFile(...).name` form left the file
    # descriptor open until garbage collection.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out_path = tmp.name
    _save_video(frames, audio, out_path)
    return out_path


# UI definition: a two-column layout with the inputs on the left and the
# resulting video on the right. Components appear in the order they are
# created inside each container.
with gr.Blocks(title="LTX-2.3 Image to Video") as demo:
    gr.Markdown(
        "# LTX-2.3 Image → Video\n"
        "Upload an image and describe the motion. Generates ~5s of video "
        "(with audio) using the LTX-2.3 22B distilled model."
    )
    with gr.Row():
        with gr.Column():
            # type="pil" asks Gradio to pass the upload as a PIL image.
            image_in = gr.Image(label="Input image", type="pil")
            prompt_in = gr.Textbox(
                label="Prompt",
                placeholder="A man plays a red electric guitar, camera slowly zooms in.",
                lines=3,
            )
            # Off by default; forwarded to generate() as its `upscale` flag.
            upscale_in = gr.Checkbox(
                label="2× high-res upscale (slower, may exceed GPU time limit)",
                value=False,
            )
            run_btn = gr.Button("Generate", variant="primary")
        with gr.Column():
            video_out = gr.Video(label="Result")

    # Input order must match generate(image, prompt, upscale); the returned
    # file path is displayed by the video component.
    run_btn.click(
        fn=generate,
        inputs=[image_in, prompt_in, upscale_in],
        outputs=video_out,
    )


# Start the Gradio server only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    demo.launch()