""" LTX-2.3 Video Generator — Gradio Space Text-to-Video & Image-to-Video using Lightricks LTX-2.3 distilled checkpoint. """ import os import sys import subprocess import tempfile import time from pathlib import Path from typing import Optional import spaces import torch import numpy as np import gradio as gr from PIL import Image # ── Setup: install Lightricks LTX-2 packages ─────────────────────────── LTX_REPO = "https://github.com/Lightricks/LTX-2.git" PACKAGES_DIR = Path("/tmp/ltx-packages") PACKAGES_DIR.mkdir(parents=True, exist_ok=True) def ensure_packages(): """Clone LTX-2 repo and install packages if not already done.""" marker = PACKAGES_DIR / ".installed" if marker.exists(): return print("[setup] Installing LTX-2 packages...") subprocess.run( ["git", "clone", "--depth", "1", LTX_REPO, str(PACKAGES_DIR / "repo")], check=True, capture_output=True ) for pkg in ["packages/ltx-core", "packages/ltx-pipelines"]: pkg_path = PACKAGES_DIR / "repo" / pkg if pkg_path.exists(): subprocess.run( [sys.executable, "-m", "pip", "install", "-e", str(pkg_path)], check=True, capture_output=True ) marker.touch() print("[setup] Packages ready") ensure_packages() # ── Imports (after packages) ──────────────────────────────────────────── from huggingface_hub import hf_hub_download from ltx_pipelines.distilled import DistilledPipeline from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES # ── Constants ─────────────────────────────────────────────────────────── MODEL_ID = "Lightricks/LTX-2.3" DISTILLED_CKPT = "ltx-2.3-22b-distilled.safetensors" SPATIAL_UPSCALER = "ltx-2.3-spatial-upscaler-x2-1.1.safetensors" GEMMA_ID = "google/gemma-3-12b-it-qat-q4_0-unquantized" CACHE_ROOT = Path("/tmp/ltx-cache") DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") DTYPE = torch.bfloat16 _pipe = None def download_checkpoint(repo_id: str, filename: str) -> str: """Download a checkpoint from Hugging Face Hub.""" return hf_hub_download( repo_id=repo_id, filename=filename, cache_dir=str(CACHE_ROOT / "hub"), ) @spaces.GPU def load_pipeline(): """Load or return cached pipeline.""" global _pipe if _pipe is not None: return _pipe print(f"[load] Device: {DEVICE} | torch: {torch.__version__}") # Download checkpoints ckpt_path = download_checkpoint(MODEL_ID, DISTILLED_CKPT) upscaler_path = download_checkpoint(MODEL_ID, SPATIAL_UPSCALER) gemma_path = download_checkpoint(GEMMA_ID, "model.safetensors") gemma_root = str(Path(gemma_path).parent) print(f"[load] Checkpoint: {ckpt_path}") print(f"[load] Gemma root: {gemma_root}") _pipe = DistilledPipeline( checkpoint_path=ckpt_path, gemma_root=gemma_root, spatial_upsampler_path=upscaler_path, loras=[], device=DEVICE, fp8transformer=True, ) print("[load] Pipeline ready") return _pipe @spaces.GPU def generate_video( prompt: str, image: Optional[np.ndarray] = None, negative_prompt: str = "", num_frames: int = 49, width: int = 768, height: int = 512, guidance_scale: float = 1.0, num_inference_steps: int = 8, seed: int = -1, ) -> str: """Generate video from text + optional image.""" pipe = load_pipeline() if seed < 0: seed = torch.randint(0, 2**31, (1,)).item() generator = torch.Generator(device=DEVICE).manual_seed(seed) # Process optional input image cond_images = None if image is not None: pil = Image.fromarray(image).resize((width, height)) cond_images = [pil] print(f"[gen] {prompt[:60]}... | {num_frames}f | seed={seed}") # Build kwargs kwargs = dict( prompt=prompt, height=height, width=width, num_frames=num_frames, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, generator=generator, output_type="pt", condition_images=cond_images, ) with torch.inference_mode(): frames = pipe(**kwargs) out_video = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name # Export frames to video from diffusers.utils import export_to_video export_to_video(frames, out_video, fps=16) print(f"[gen] ✅ {out_video}") return out_video # ── Gradio UI ─────────────────────────────────────────────────────────── with gr.Blocks(theme=gr.themes.Soft(), title="LTX-2.3 Video Generator") as demo: gr.Markdown( """ # 🎬 LTX‑2.3 Video Generator **Lightricks LTX‑2.3** — 22B audio‑video foundation model Uses the distilled checkpoint (8‑step turbo) for fast generation on ZeroGPU. """ ) with gr.Row(): with gr.Column(scale=1): prompt = gr.Textbox( label="Prompt", placeholder="A cinematic drone shot over misty mountains at sunrise...", lines=3, ) with gr.Accordion("⚙️ Settings", open=False): with gr.Row(): num_frames = gr.Slider(9, 97, value=49, step=8, label="Frames") steps = gr.Slider(4, 24, value=8, step=1, label="Steps") with gr.Row(): guidance = gr.Slider(0.5, 3.0, value=1.0, step=0.1, label="CFG Scale") seed = gr.Number(value=-1, label="Seed (-1 = random)", precision=0) with gr.Row(): width = gr.Dropdown([512, 576, 640, 768, 832, 896, 1024], value=768, label="Width") height = gr.Dropdown([384, 448, 512, 576, 640, 704, 768], value=512, label="Height") generate_btn = gr.Button("🚀 Generate", variant="primary", size="lg") with gr.Column(scale=1): image_input = gr.Image(label="Input Image (optional)", type="numpy") video_output = gr.Video(label="Generated Video", autoplay=True) with gr.Row(): gr.Examples( examples=[ ["Cinematic drone shot over a misty mountain range at sunrise, golden light piercing clouds"], ["Fluffy Samoyed puppy playing in a tulip field, slow motion, shallow depth of field"], ["Cyberpunk city street at night, neon signs reflecting on wet pavement, cinematic lighting"], ["Majestic humpback whale breaching at sunset, slow motion, National Geographic style"], ["Time-lapse of cherry blossoms blooming in a Japanese garden, peaceful atmosphere"], ], inputs=[prompt], ) generate_btn.click( fn=generate_video, inputs=[prompt, image_input, num_frames, width, height, guidance, steps, seed], outputs=video_output, ) prompt.submit( fn=generate_video, inputs=[prompt, image_input, num_frames, width, height, guidance, steps, seed], outputs=video_output, ) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860)