Spaces:

DevXCoder2025
/

ltx-m-v2

Sleeping

c6c0f0c 6 days ago

6.45 kB

	"""LTX-2.3 image-to-video Gradio Space (ZeroGPU).

	Upload an image + prompt -> short MP4 (with audio) generated by the LTX-2.3 22B
	distilled model via diffusers. Matches the model stack of the
	WhatDreamsCost "LTX Director 2" ComfyUI workflow, reimplemented natively.

	See docs/superpowers/specs/2026-06-25-ltx-image-to-video-space-design.md
	"""

	import random
	import tempfile

	import gradio as gr
	import spaces
	import torch
	from PIL import Image

	from diffusers import LTX2ImageToVideoPipeline

	# --- Fixed generation settings (reference workflow + distilled recipe) ---
	MODEL_ID = "diffusers/LTX-2.3-Distilled-Diffusers"  # identifier passed to from_pretrained
	FRAME_RATE = 24.0  # frames per second used for generation and export
	NUM_FRAMES = 8 * 15 + 1  # 121 frames; must be 8k + 1 (~5s at 24 fps)
	NUM_STEPS = 8  # inference steps for the distilled model
	GUIDANCE_SCALE = 1.0  # CFG = 1 for the distilled model
	BASE_LONG_SIDE = 32 * 22  # 704 px base-stage long edge (rounded to /32 per axis)
	GPU_DURATION = 120  # ZeroGPU seconds budget per call
	MAX_SEED = (1 << 32) - 1  # upper bound (inclusive) for the random seed

	# Negative prompt: prefer the default shipped with the installed diffusers
	# build; when that import fails for any reason, use a short local fallback.
	_FALLBACK_NEGATIVE_PROMPT = "worst quality, inconsistent motion, blurry, jittery, distorted"
	try:
	    from diffusers.pipelines.ltx2.utils import DEFAULT_NEGATIVE_PROMPT
	except Exception:  # pragma: no cover - depends on diffusers version
	    DEFAULT_NEGATIVE_PROMPT = _FALLBACK_NEGATIVE_PROMPT

	# ZeroGPU attaches the GPU only inside the @spaces.GPU worker, so the pipeline
	# is loaded once here at import time on CPU. CUDA placement / offload is set up
	# later inside generate(); this flag records whether that has happened yet.
	_offload_ready = False
	pipe = LTX2ImageToVideoPipeline.from_pretrained(
	    MODEL_ID,
	    torch_dtype=torch.bfloat16,
	)


	def _target_size(image: Image.Image, long_side: int = BASE_LONG_SIDE):
	    """Compute a `(width, height)` that follows the image's aspect ratio.

	    The longer edge is set to `long_side` and the shorter edge is scaled by
	    the aspect ratio; each axis is then rounded to a multiple of 32 with a
	    floor of 256. An image with zero height is treated as square.
	    """
	    src_w, src_h = image.size
	    aspect = src_w / src_h if src_h else 1.0

	    def _snap(edge):
	        # Round to a multiple of 32, never going below 256.
	        return int(max(256, round(edge / 32) * 32))

	    if aspect < 1.0:
	        # Portrait: the height carries the long edge.
	        return _snap(long_side * aspect), _snap(long_side)
	    # Landscape or square: the width carries the long edge.
	    return _snap(long_side), _snap(long_side / aspect)


	def _normalize_output(result):
	"""Return (frames, audio) regardless of pipeline return shape.

	The main LTX2Pipeline returns a (video, audio) tuple; the distilled card
	shows a `.frames[0]` object. Handle both.
	"""
	if isinstance(result, (tuple, list)) and len(result) == 2:
	video, audio = result
	# `video` may itself be a batch list of frame-lists.
	if video and isinstance(video[0], (list, tuple)):
	video = video[0]
	return video, audio
	frames = result.frames[0]
	audio = getattr(result, "audio", None)
	return frames, audio


	def _save_video(frames, audio, path: str, audio_sample_rate=None):
	    """Export frames (+ audio if available) to an MP4 at `path`.

	    When `audio` is given, the LTX-2 joint audio/video exporter is tried
	    first. If it is unavailable or fails, the failure is logged and the video
	    is written without sound via `diffusers.utils.export_to_video`.

	    Args:
	        frames: Video frames to encode.
	        audio: Generated waveform, or `None` for a silent video.
	        path: Destination MP4 path.
	        audio_sample_rate: Sample rate of `audio`. When omitted it is read
	            from the loaded pipeline (`pipe.vocoder.config.output_sampling_rate`).
	    """
	    import logging

	    if audio is not None:
	        try:
	            from diffusers.pipelines.ltx2.export_utils import encode_video

	            if audio_sample_rate is None:
	                audio_sample_rate = pipe.vocoder.config.output_sampling_rate
	            # assumes a batched waveform is 3-D (batch, channels, samples), as in
	            # the diffusers LTX-2 example that exports `audio[0]` — TODO confirm
	            if getattr(audio, "ndim", 0) == 3:
	                audio = audio[0]
	            if hasattr(audio, "float"):
	                audio = audio.float().cpu()
	            # Keyword arguments follow the diffusers LTX-2 export example. The
	            # previous positional call put `audio` in the fps slot, so it always
	            # raised and the sound track was silently dropped.
	            encode_video(
	                frames,
	                fps=FRAME_RATE,
	                audio=audio,
	                audio_sample_rate=audio_sample_rate,
	                output_path=path,
	            )
	            return
	        except Exception:
	            # Still best-effort, but leave a trace instead of failing silently.
	            logging.getLogger(__name__).warning(
	                "Audio/video export failed; writing video without audio.",
	                exc_info=True,
	            )
	    from diffusers.utils import export_to_video

	    export_to_video(frames, path, fps=int(FRAME_RATE))


	def _maybe_upscale(frames):
	    """Run the optional 2x spatial upscale stage over `frames`.

	    This stage is opt-in and best-effort. Nothing is caught here: if the
	    upsample pipeline cannot be imported, loaded, or run, the exception
	    propagates and the caller is expected to keep the base-resolution frames.
	    """
	    # Imported lazily so a build without this pipeline only fails when the
	    # upscale stage is actually requested.
	    from diffusers import LTXLatentUpsamplePipeline  # raises if unavailable

	    load_kwargs = {"subfolder": "latent_upsampler", "torch_dtype": torch.bfloat16}
	    upscale_pipe = LTXLatentUpsamplePipeline.from_pretrained(
	        "Lightricks/LTX-2.3", **load_kwargs
	    )
	    upscale_pipe.to("cuda")
	    # NOTE(review): passing decoded frames positionally has not been checked
	    # against the upsample pipeline's call signature — confirm before relying on it.
	    upscaled = upscale_pipe(frames)
	    return upscaled.frames[0]


	@spaces.GPU(duration=GPU_DURATION)
	def generate(image, prompt, upscale, progress=gr.Progress(track_tqdm=True)):
	    """Turn an input image and a motion prompt into an MP4 file.

	    Args:
	        image: Uploaded image; a PIL image, or an array that
	            `Image.fromarray` can convert.
	        prompt: Text describing the desired motion.
	        upscale: When truthy, attempt the best-effort 2x upscale stage.
	        progress: Gradio progress tracker, created with `track_tqdm=True`.

	    Returns:
	        Path of the generated `.mp4` file.

	    Raises:
	        gr.Error: If the image or prompt is missing, or the GPU runs out of
	            memory during generation.
	    """
	    global _offload_ready
	    import logging

	    if image is None:
	        raise gr.Error("Please upload an image first.")
	    if not prompt or not prompt.strip():
	        raise gr.Error("Please enter a prompt describing the motion.")

	    # CPU offload is enabled on first use only, inside the GPU worker.
	    if not _offload_ready:
	        pipe.enable_model_cpu_offload()
	        _offload_ready = True

	    if not isinstance(image, Image.Image):
	        image = Image.fromarray(image)
	    image = image.convert("RGB")
	    width, height = _target_size(image)

	    # A fresh random seed is drawn for every call.
	    seed = random.randint(0, MAX_SEED)
	    generator = torch.Generator(device="cuda").manual_seed(seed)

	    try:
	        result = pipe(
	            image=image,
	            prompt=prompt.strip(),
	            negative_prompt=DEFAULT_NEGATIVE_PROMPT,
	            width=width,
	            height=height,
	            num_frames=NUM_FRAMES,
	            frame_rate=FRAME_RATE,
	            num_inference_steps=NUM_STEPS,
	            guidance_scale=GUIDANCE_SCALE,
	            generator=generator,
	        )
	    except torch.cuda.OutOfMemoryError as exc:  # pragma: no cover
	        torch.cuda.empty_cache()
	        raise gr.Error("Ran out of GPU memory. Try a smaller image.") from exc

	    frames, audio = _normalize_output(result)

	    if upscale:
	        try:
	            frames = _maybe_upscale(frames)
	        except Exception:
	            # Keep the base-resolution frames, but record why upscaling failed
	            # so the cause is visible in the logs and not only as a UI notice.
	            logging.getLogger(__name__).warning(
	                "2x upscale stage failed; keeping base-resolution frames.",
	                exc_info=True,
	            )
	            gr.Warning(
	                "2x upscale stage is unavailable in this build — "
	                "returning base-resolution video."
	            )

	    # Reserve a unique output path and close the handle right away; the
	    # exporter reopens the path itself, so the open handle was only leaking.
	    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
	        out_path = tmp.name
	    _save_video(frames, audio, out_path)
	    return out_path


	# --- UI: inputs on the left, resulting video on the right ---
	with gr.Blocks(title="LTX-2.3 Image to Video") as demo:
	    gr.Markdown(
	        "# LTX-2.3 Image → Video\n"
	        "Upload an image and describe the motion. Generates ~5s of video "
	        "(with audio) using the LTX-2.3 22B distilled model."
	    )
	    with gr.Row():
	        # Input column: image, prompt, upscale toggle and the trigger button.
	        with gr.Column():
	            image_in = gr.Image(type="pil", label="Input image")
	            prompt_in = gr.Textbox(
	                lines=3,
	                label="Prompt",
	                placeholder="A man plays a red electric guitar, camera slowly zooms in.",
	            )
	            upscale_in = gr.Checkbox(
	                value=False,
	                label="2× high-res upscale (slower, may exceed GPU time limit)",
	            )
	            run_btn = gr.Button("Generate", variant="primary")
	        # Output column: the generated clip.
	        with gr.Column():
	            video_out = gr.Video(label="Result")

	    # Wire the button to the generator: (image, prompt, upscale) -> video.
	    run_btn.click(generate, [image_in, prompt_in, upscale_in], video_out)


	# Start the app only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()