# Hugging Face Space file: ltx-m-v2 / app.py (uploaded by romanbwrk, commit c6c0f0c "x", 6.45 kB)
"""LTX-2.3 image-to-video Gradio Space (ZeroGPU).
Upload an image + prompt -> short MP4 (with audio) generated by the LTX-2.3 22B
distilled model via diffusers. Matches the model stack of the
WhatDreamsCost "LTX Director 2" ComfyUI workflow, reimplemented natively.
See docs/superpowers/specs/2026-06-25-ltx-image-to-video-space-design.md
"""
import random
import tempfile
import gradio as gr
import spaces
import torch
from PIL import Image
from diffusers import LTX2ImageToVideoPipeline
# --- Generation settings (taken from the reference workflow and the distilled recipe) ---
MODEL_ID = "diffusers/LTX-2.3-Distilled-Diffusers"

# Clip length: the frame count has to be of the form 8k + 1; 121 frames is
# roughly five seconds at 24 fps.
FRAME_RATE = 24.0
NUM_FRAMES = 8 * 15 + 1

# Sampler settings for the distilled checkpoint: few steps, CFG fixed at 1.
NUM_STEPS = 8
GUIDANCE_SCALE = 1.0

# Long edge of the base stage; each axis is later rounded to a multiple of 32.
BASE_LONG_SIDE = 704

# ZeroGPU time budget, in seconds, granted to a single call.
GPU_DURATION = 120

# Largest seed value handed to the random generator.
MAX_SEED = (1 << 32) - 1
# Negative prompt: prefer the default that ships with the pipeline and fall
# back to a short built-in one when that import is not possible (best-effort).
_FALLBACK_NEGATIVE_PROMPT = "worst quality, inconsistent motion, blurry, jittery, distorted"
try:
    from diffusers.pipelines.ltx2.utils import DEFAULT_NEGATIVE_PROMPT
except Exception:  # pragma: no cover - depends on diffusers version
    DEFAULT_NEGATIVE_PROMPT = _FALLBACK_NEGATIVE_PROMPT
# The pipeline is instantiated a single time, at import, and stays on CPU.
# ZeroGPU only attaches a GPU inside the @spaces.GPU worker, so CUDA placement
# and offloading are configured there rather than here.
pipe = LTX2ImageToVideoPipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
)

# Set to True by generate() once CPU offload has been enabled on `pipe`.
_offload_ready = False
def _target_size(
    image: Image.Image,
    long_side: int = BASE_LONG_SIDE,
    *,
    multiple: int = 32,
    min_side: int = 256,
):
    """Fit the image aspect ratio into `long_side`, each axis a multiple of `multiple`.

    The longer image edge is mapped to `long_side` and the shorter edge is
    scaled by the aspect ratio. Both are then rounded to the nearest multiple
    of `multiple` and clamped from below.

    Args:
        image: Object exposing a ``size`` attribute of ``(width, height)``.
        long_side: Target length of the longer edge before rounding.
        multiple: Alignment each returned axis must satisfy (default 32).
        min_side: Smallest value either axis may take (default 256). It is
            rounded up to the next multiple of `multiple` so that clamping
            can never break the alignment guarantee.

    Returns:
        ``(width, height)`` as a tuple of ints.
    """
    src_w, src_h = image.size
    # A zero height would divide by zero; treat such an image as square.
    aspect = src_w / src_h if src_h else 1.0

    if aspect >= 1.0:
        # Landscape or square: width is the long edge.
        raw_w, raw_h = long_side, long_side / aspect
    else:
        # Portrait: height is the long edge.
        raw_w, raw_h = long_side * aspect, long_side

    # Ceil-divide so a floor that is not already aligned is bumped up, not down.
    floor = -(-min_side // multiple) * multiple

    out_w = max(floor, round(raw_w / multiple) * multiple)
    out_h = max(floor, round(raw_h / multiple) * multiple)
    return int(out_w), int(out_h)
def _normalize_output(result):
    """Return ``(frames, audio)`` regardless of the pipeline's return shape.

    Two result shapes are handled:

    * a 2-item tuple/list ``(video, audio)``;
    * an object with a ``frames`` attribute (first batch entry is used) and an
      optional ``audio`` attribute.

    For the tuple form, `video` is un-batched when it is a list of frame-lists
    or a 5-D array/tensor. The 5-D case assumes a leading batch axis
    (batch, frames, H, W, C) — TODO confirm against the pipeline's
    ``output_type`` in use.

    Args:
        result: Whatever the pipeline call returned.

    Returns:
        ``(frames, audio)``; `audio` is ``None`` when the result carries none.
    """
    if isinstance(result, (tuple, list)) and len(result) == 2:
        video, audio = result
        if getattr(video, "ndim", None) == 5:
            # Arrays/tensors have no usable truth value (``if video`` raises),
            # so detect the batch axis by rank instead.
            video = video[0]
        elif (
            isinstance(video, (list, tuple))
            and len(video) > 0
            and isinstance(video[0], (list, tuple))
        ):
            # `video` is itself a batch list of frame-lists.
            video = video[0]
        return video, audio

    frames = result.frames[0]
    audio = getattr(result, "audio", None)
    return frames, audio
def _save_video(frames, audio, path: str, audio_sample_rate=None):
    """Export frames (+ audio if available) to an MP4 at `path`.

    When `audio` is given, the joint audio/video exporter from diffusers is
    tried first. If that fails for any reason the failure is logged and the
    function falls back to a video-only export, so a clip is still produced.

    Args:
        frames: Video frames for a single clip.
        audio: Generated audio, or ``None`` for a silent clip.
        path: Destination file path.
        audio_sample_rate: Sample rate of `audio`. When omitted it is read
            from ``pipe.vocoder.config.output_sampling_rate``.
    """
    if audio is not None:
        try:
            from diffusers.pipelines.ltx2.export_utils import encode_video

            if audio_sample_rate is None:
                audio_sample_rate = pipe.vocoder.config.output_sampling_rate

            # NOTE(review): the diffusers LTX-2 usage example passes
            # `audio[0].float().cpu()`, i.e. one batch item as float32 on CPU.
            # A 3-D value is assumed to carry a leading batch axis — confirm.
            if getattr(audio, "ndim", None) == 3:
                audio = audio[0]
            if isinstance(audio, torch.Tensor):
                audio = audio.float().cpu()

            # Keyword arguments: the exporter's parameter order is
            # (video, fps, audio, audio_sample_rate, output_path), so passing
            # (frames, audio, fps, path) positionally would misbind them.
            encode_video(
                frames,
                fps=FRAME_RATE,
                audio=audio,
                audio_sample_rate=audio_sample_rate,
                output_path=path,
            )
            return
        except Exception:
            # Deliberate best-effort: keep going with video-only output, but
            # leave a trace so a silently missing soundtrack can be diagnosed.
            import logging

            logging.getLogger(__name__).warning(
                "Audio/video export failed; falling back to video-only export.",
                exc_info=True,
            )

    from diffusers.utils import export_to_video

    export_to_video(frames, path, fps=int(FRAME_RATE))
def _maybe_upscale(frames):
    """Run the optional, best-effort 2x spatial upscale stage on `frames`.

    The diffusers two-stage upscaler API for LTX-2.3 is not yet stable, so any
    failure here (including the import) simply propagates. The caller is
    expected to catch it, keep the base-resolution frames and warn the user.

    Args:
        frames: Frames produced by the base generation stage.

    Returns:
        The first entry of the upsampler output's ``frames``.
    """
    # Imported lazily: raises right here if this diffusers build lacks it.
    from diffusers import LTXLatentUpsamplePipeline

    upscaler = LTXLatentUpsamplePipeline.from_pretrained(
        "Lightricks/LTX-2.3",
        subfolder="latent_upsampler",
        torch_dtype=torch.bfloat16,
    )
    upscaler.to("cuda")

    upscaled = upscaler(frames)
    return upscaled.frames[0]
@spaces.GPU(duration=GPU_DURATION)
def generate(image, prompt, upscale, progress=gr.Progress(track_tqdm=True)):
    """Turn an uploaded still image plus a motion prompt into an MP4 file.

    Wrapped with ``spaces.GPU(duration=GPU_DURATION)``. A fresh random seed is
    drawn on every call.

    Args:
        image: The uploaded picture. A PIL image is used as-is; anything else
            is passed through ``Image.fromarray``.
        prompt: Text describing the desired motion; surrounding whitespace is
            stripped before it reaches the pipeline.
        upscale: When truthy, attempt the best-effort 2x upscale stage and
            keep the base-resolution frames (with a UI warning) if it fails.
        progress: Gradio progress tracker created with ``track_tqdm=True``;
            not referenced directly in the body.

    Returns:
        Filesystem path of the generated ``.mp4`` file.

    Raises:
        gr.Error: If no image or no prompt was supplied, or if the pipeline
            ran out of GPU memory.
    """
    global _offload_ready

    if image is None:
        raise gr.Error("Please upload an image first.")
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt describing the motion.")

    # CPU offload is enabled lazily, on the first call, instead of at import.
    if not _offload_ready:
        pipe.enable_model_cpu_offload()
        _offload_ready = True

    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    image = image.convert("RGB")
    width, height = _target_size(image)

    seed = random.randint(0, MAX_SEED)
    generator = torch.Generator(device="cuda").manual_seed(seed)

    try:
        result = pipe(
            image=image,
            prompt=prompt.strip(),
            negative_prompt=DEFAULT_NEGATIVE_PROMPT,
            width=width,
            height=height,
            num_frames=NUM_FRAMES,
            frame_rate=FRAME_RATE,
            num_inference_steps=NUM_STEPS,
            guidance_scale=GUIDANCE_SCALE,
            generator=generator,
        )
    except torch.cuda.OutOfMemoryError as exc:  # pragma: no cover
        torch.cuda.empty_cache()
        raise gr.Error("Ran out of GPU memory. Try a smaller image.") from exc

    frames, audio = _normalize_output(result)

    if upscale:
        try:
            frames = _maybe_upscale(frames)
        except Exception:
            # Deliberate best-effort: the upscale stage is optional.
            gr.Warning(
                "2x upscale stage is unavailable in this build — "
                "returning base-resolution video."
            )

    # Reserve a unique output path, then close the handle straight away so it
    # is not leaked and the encoder can reopen the file for writing.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        out_path = tmp.name
    _save_video(frames, audio, out_path)
    return out_path
# --- User interface: inputs in the left column, resulting video on the right ---
with gr.Blocks(title="LTX-2.3 Image to Video") as demo:
    gr.Markdown(
        value=(
            "# LTX-2.3 Image → Video\n"
            "Upload an image and describe the motion. Generates ~5s of video "
            "(with audio) using the LTX-2.3 22B distilled model."
        )
    )

    with gr.Row():
        # Left column: everything the user supplies, plus the trigger button.
        with gr.Column():
            image_in = gr.Image(type="pil", label="Input image")
            prompt_in = gr.Textbox(
                lines=3,
                label="Prompt",
                placeholder="A man plays a red electric guitar, camera slowly zooms in.",
            )
            # The upscale stage is opt-in, so the box starts unchecked.
            upscale_in = gr.Checkbox(
                value=False,
                label="2× high-res upscale (slower, may exceed GPU time limit)",
            )
            run_btn = gr.Button(value="Generate", variant="primary")

        # Right column: the generated clip.
        with gr.Column():
            video_out = gr.Video(label="Result")

    # Clicking the button runs generate() on the three inputs and shows the MP4.
    run_btn.click(generate, [image_in, prompt_in, upscale_in], video_out)


if __name__ == "__main__":
    demo.launch()