Commit aaa6ac1 (verified) by multimodalart (HF Staff)
Parent: 8396596

Update app.py

Files changed (1):
  1. app.py +122 -132

app.py CHANGED
@@ -6,6 +6,9 @@ import sys
 os.environ["TORCH_COMPILE_DISABLE"] = "1"
 os.environ["TORCHDYNAMO_DISABLE"] = "1"
 
+# Install xformers for memory-efficient attention
+subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
+
 # Clone LTX-2 repo and install packages
 LTX_REPO_URL = "https://github.com/Lightricks/LTX-2.git"
 LTX_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "LTX-2")
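Note: the added install step runs with check=False, so a failed wheel build will not crash the Space at startup; whether xformers actually became importable is only discovered by the try/except patch in the next hunk. A minimal sketch of the same best-effort pattern (ensure_package is a hypothetical helper, not part of the app):

    import importlib
    import importlib.util
    import subprocess
    import sys

    def ensure_package(pip_spec: str, module_name: str) -> bool:
        # Best-effort install; True only if the module is importable afterwards.
        if importlib.util.find_spec(module_name) is None:
            subprocess.run([sys.executable, "-m", "pip", "install", pip_spec], check=False)
            importlib.invalidate_caches()  # make a just-installed package visible
        return importlib.util.find_spec(module_name) is not None

    print(ensure_package("xformers==0.0.32.post2", "xformers"))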
@@ -37,17 +40,24 @@ torch._dynamo.config.disable = True
 import spaces
 import gradio as gr
 import numpy as np
-from gradio_client import Client, handle_file
-from huggingface_hub import hf_hub_download
+from huggingface_hub import hf_hub_download, snapshot_download
 
 from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
 from ltx_core.quantization import QuantizationPolicy
-from ltx_core.text_encoders.gemma.embeddings_processor import EmbeddingsProcessorOutput
-import ltx_pipelines.distilled as distilled_module
 from ltx_pipelines.distilled import DistilledPipeline
 from ltx_pipelines.utils.args import ImageConditioningInput
 from ltx_pipelines.utils.media_io import encode_video
 
+# Force-patch xformers attention into the LTX attention module.
+from ltx_core.model.transformer import attention as _attn_mod
+print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
+try:
+    from xformers.ops import memory_efficient_attention as _mea
+    _attn_mod.memory_efficient_attention = _mea
+    print(f"[ATTN] After patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
+except Exception as e:
+    print(f"[ATTN] xformers patch FAILED: {type(e).__name__}: {e}")
+
 logging.getLogger().setLevel(logging.INFO)
 
 MAX_SEED = np.iinfo(np.int32).max
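Note: rebinding _attn_mod.memory_efficient_attention works because functions defined in that module resolve the name through the module's globals at call time, so every later call picks up the replacement. A self-contained sketch of the mechanism using a synthetic module (not the real ltx_core one):

    import types

    mod = types.ModuleType("fake_attention")
    exec(
        "def memory_efficient_attention(q): return 'fallback'\n"
        "def forward(q): return memory_efficient_attention(q)\n",
        mod.__dict__,
    )

    print(mod.forward(None))                               # fallback
    mod.memory_efficient_attention = lambda q: "xformers"  # same rebinding as above
    print(mod.forward(None))                               # xformers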
@@ -55,68 +65,63 @@ DEFAULT_PROMPT = (
     "An astronaut hatches from a fragile egg on the surface of the Moon, "
     "the shell cracking and peeling apart in gentle low-gravity motion. "
     "Fine lunar dust lifts and drifts outward with each movement, floating "
-    "in slow arcs before settling back onto the ground. The astronaut pushes "
-    "free in a deliberate, weightless motion, small fragments of the egg "
-    "tumbling and spinning through the air."
+    "in slow arcs before settling back onto the ground."
 )
-DEFAULT_HEIGHT = 1024
-DEFAULT_WIDTH = 1536
 DEFAULT_FRAME_RATE = 24.0
 
-# Model repo
-LTX_MODEL_REPO = "diffusers-internal-dev/ltx-23"
-
-# Text encoder space URL - must be a 2.3-compatible text encoder
-TEXT_ENCODER_SPACE = "multimodalart/gemma-text-encoder-ltx23"
+# Resolution presets: (width, height)
+RESOLUTIONS = {
+    "high": {"16:9": (1536, 1024), "9:16": (1024, 1536), "1:1": (1024, 1024)},
+    "low": {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768)},
+}
+
+# Model repos
+LTX_MODEL_REPO = "diffusers-internal-dev/ltx-23"
+GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
 
 # Download model checkpoints
 print("=" * 80)
-print("Downloading LTX-2.3 distilled model...")
+print("Downloading LTX-2.3 distilled model + Gemma...")
 print("=" * 80)
 
 checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-22b-distilled.safetensors")
 spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
+gemma_root = snapshot_download(repo_id=GEMMA_REPO)
 
 print(f"Checkpoint: {checkpoint_path}")
 print(f"Spatial upsampler: {spatial_upsampler_path}")
+print(f"Gemma root: {gemma_root}")
 
-# Initialize pipeline WITHOUT text encoder (gemma_root=None)
-# Text encoding will be done by external space
+# Initialize pipeline WITH text encoder
 pipeline = DistilledPipeline(
     distilled_checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
-    gemma_root=None,
+    gemma_root=gemma_root,
     loras=[],
     quantization=QuantizationPolicy.fp8_cast(),
 )
 
-# Preload small models for ZeroGPU tensor packing.
-# DO NOT preload the transformer (~20GB) — the pipeline needs to load/unload
-# it between stages (FP8 upcast doubles it to ~44GB during forward pass).
-# Keeping it cached prevents cleanup_memory() from freeing it.
-print("Preloading small models...")
+# Preload all models for ZeroGPU tensor packing.
+print("Preloading all models (including Gemma)...")
 ledger = pipeline.model_ledger
+_transformer = ledger.transformer()
 _video_encoder = ledger.video_encoder()
 _video_decoder = ledger.video_decoder()
 _audio_decoder = ledger.audio_decoder()
 _vocoder = ledger.vocoder()
 _spatial_upsampler = ledger.spatial_upsampler()
+_text_encoder = ledger.text_encoder()
+_embeddings_processor = ledger.gemma_embeddings_processor()
 
+ledger.transformer = lambda: _transformer
 ledger.video_encoder = lambda: _video_encoder
 ledger.video_decoder = lambda: _video_decoder
 ledger.audio_decoder = lambda: _audio_decoder
 ledger.vocoder = lambda: _vocoder
 ledger.spatial_upsampler = lambda: _spatial_upsampler
-print("Small models preloaded! (transformer loads on demand per stage)")
-
-# Connect to text encoder space
-print(f"Connecting to text encoder space: {TEXT_ENCODER_SPACE}")
-try:
-    text_encoder_client = Client(TEXT_ENCODER_SPACE)
-    print("Text encoder client connected!")
-except Exception as e:
-    print(f"Warning: Could not connect to text encoder space: {e}")
-    text_encoder_client = None
+ledger.text_encoder = lambda: _text_encoder
+ledger.gemma_embeddings_processor = lambda: _embeddings_processor
+print("All models preloaded (including Gemma text encoder)!")
 
 print("=" * 80)
 print("Pipeline ready!")
@@ -124,20 +129,46 @@ print("=" * 80)
 
 
 def log_memory(tag: str):
-    """Log GPU memory usage at a given point."""
     if torch.cuda.is_available():
         allocated = torch.cuda.memory_allocated() / 1024**3
-        reserved = torch.cuda.memory_reserved() / 1024**3
         peak = torch.cuda.max_memory_allocated() / 1024**3
         free, total = torch.cuda.mem_get_info()
-        free_gb = free / 1024**3
-        total_gb = total / 1024**3
-        print(f"[VRAM {tag}] allocated={allocated:.2f}GB reserved={reserved:.2f}GB peak={peak:.2f}GB free={free_gb:.2f}GB total={total_gb:.2f}GB")
+        print(f"[VRAM {tag}] allocated={allocated:.2f}GB peak={peak:.2f}GB free={free / 1024**3:.2f}GB total={total / 1024**3:.2f}GB")
+
+
+def detect_aspect_ratio(image) -> str:
+    """Detect the closest aspect ratio (16:9, 9:16, or 1:1) from an image."""
+    if image is None:
+        return "16:9"
+    if hasattr(image, "size"):
+        w, h = image.size
+    elif hasattr(image, "shape"):
+        h, w = image.shape[:2]
     else:
-        print(f"[VRAM {tag}] CUDA not available")
+        return "16:9"
+    ratio = w / h
+    candidates = {"16:9": 16 / 9, "9:16": 9 / 16, "1:1": 1.0}
+    return min(candidates, key=lambda k: abs(ratio - candidates[k]))
+
+
+def on_image_upload(image, high_res):
+    """Auto-set resolution when image is uploaded."""
+    aspect = detect_aspect_ratio(image)
+    tier = "high" if high_res else "low"
+    w, h = RESOLUTIONS[tier][aspect]
+    return gr.update(value=w), gr.update(value=h)
+
+
+def on_highres_toggle(image, high_res):
+    """Update resolution when high-res toggle changes."""
+    aspect = detect_aspect_ratio(image)
+    tier = "high" if high_res else "low"
+    w, h = RESOLUTIONS[tier][aspect]
+    return gr.update(value=w), gr.update(value=h)
 
 
-@spaces.GPU(duration=120, size="xlarge")
+@spaces.GPU(duration=300, size="xlarge")
+@torch.inference_mode()
 def generate_video(
     input_image,
     prompt: str,
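Note: detect_aspect_ratio above unpacks PIL's Image.size as (width, height). One caveat: a raw numpy array also exposes .size (a scalar element count), so it would take the first branch and fail to unpack; the .shape fallback is only reachable for inputs without a .size attribute. A quick check with PIL inputs (assuming the detect_aspect_ratio defined above is in scope):

    from PIL import Image

    landscape = Image.new("RGB", (1920, 1080))  # ratio 1.78, closest to 16:9
    portrait = Image.new("RGB", (1080, 1920))   # ratio 0.56, closest to 9:16
    square = Image.new("RGB", (1000, 1000))     # ratio 1.0, exactly 1:1

    for img in (landscape, portrait, square, None):
        print(detect_aspect_ratio(img))  # 16:9, 9:16, 1:1, then the 16:9 default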
@@ -145,11 +176,10 @@ def generate_video(
     enhance_prompt: bool = True,
     seed: int = 42,
     randomize_seed: bool = True,
-    height: int = DEFAULT_HEIGHT,
-    width: int = DEFAULT_WIDTH,
+    height: int = 1024,
+    width: int = 1536,
     progress=gr.Progress(track_tqdm=True),
 ):
-    """Generate a video based on the given parameters."""
     try:
         torch.cuda.reset_peak_memory_stats()
         log_memory("start")
@@ -158,14 +188,11 @@ def generate_video(
 
         frame_rate = DEFAULT_FRAME_RATE
         num_frames = int(duration * frame_rate) + 1
-        # 8k+1 format
         num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1
 
         print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")
 
-        # Handle image input
         images = []
-        temp_image_path = None
         if input_image is not None:
             output_dir = Path("outputs")
             output_dir.mkdir(exist_ok=True)
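Note: the rounding line whose "# 8k+1 format" comment is removed in this hunk snaps the frame count up to the next value of the form 8k + 1, the frame-count format the pipeline expects. Worked through at 24 fps:

    frame_rate = 24.0

    for duration in (3.0, 3.5):
        num_frames = int(duration * frame_rate) + 1       # 73, then 85
        num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1  # 73 (8*9+1), then 89 (8*11+1)
        print(duration, num_frames)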
@@ -176,103 +203,49 @@ def generate_video(
             temp_image_path = Path(input_image)
             images = [ImageConditioningInput(path=str(temp_image_path), frame_idx=0, strength=1.0)]
 
-        # Get embeddings from text encoder space
-        print(f"Encoding prompt: {prompt}")
-
-        if text_encoder_client is None:
-            raise RuntimeError(
-                f"Text encoder client not connected. Please ensure the text encoder space "
-                f"({TEXT_ENCODER_SPACE}) is running and accessible."
-            )
-
-        try:
-            image_input = None
-            if temp_image_path is not None:
-                image_input = handle_file(str(temp_image_path))
-
-            result = text_encoder_client.predict(
-                prompt=prompt,
-                enhance_prompt=enhance_prompt,
-                input_image=image_input,
-                seed=current_seed,
-                negative_prompt="",
-                api_name="/encode_prompt",
-            )
-            embedding_path = result[0]
-            print(f"Embeddings received from: {embedding_path}")
-
-            embeddings = torch.load(embedding_path)
-            video_context = embeddings["video_context"].to("cuda")
-            audio_context = embeddings["audio_context"]
-            if audio_context is not None:
-                audio_context = audio_context.to("cuda")
-            print("Embeddings loaded successfully")
-            log_memory("after embeddings loaded")
-        except Exception as e:
-            raise RuntimeError(
-                f"Failed to get embeddings from text encoder space: {e}\n"
-                f"Please ensure {TEXT_ENCODER_SPACE} is running properly."
-            )
-
-        # Monkey-patch encode_prompts on the distilled module directly
-        # (it imports encode_prompts from helpers, so we must patch the local reference)
-        precomputed = EmbeddingsProcessorOutput(
-            video_encoding=video_context,
-            audio_encoding=audio_context,
-            attention_mask=torch.ones(1, device="cuda"),  # dummy mask
-        )
-        original_encode_prompts = distilled_module.encode_prompts
-        distilled_module.encode_prompts = lambda *args, **kwargs: [precomputed]
-
-        try:
-            tiling_config = TilingConfig.default()
-            video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
-
-            log_memory("before pipeline call")
-
-            video, audio = pipeline(
-                prompt=prompt,
-                seed=current_seed,
-                height=height,
-                width=width,
-                num_frames=num_frames,
-                frame_rate=frame_rate,
-                images=images,
-                tiling_config=tiling_config,
-                enhance_prompt=False,  # Already enhanced by text encoder space
-            )
-
-            log_memory("after pipeline call")
-
-            output_path = tempfile.mktemp(suffix=".mp4")
-            encode_video(
-                video=video,
-                fps=frame_rate,
-                audio=audio,
-                output_path=output_path,
-                video_chunks_number=video_chunks_number,
-            )
-
-            log_memory("after encode_video")
-
-            return str(output_path), current_seed
-        finally:
-            # Restore original encode_prompts
-            distilled_module.encode_prompts = original_encode_prompts
+        tiling_config = TilingConfig.default()
+        video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
+
+        log_memory("before pipeline call")
+
+        video, audio = pipeline(
+            prompt=prompt,
+            seed=current_seed,
+            height=int(height),
+            width=int(width),
+            num_frames=num_frames,
+            frame_rate=frame_rate,
+            images=images,
+            tiling_config=tiling_config,
+            enhance_prompt=enhance_prompt,
+        )
+
+        log_memory("after pipeline call")
+
+        output_path = tempfile.mktemp(suffix=".mp4")
+        encode_video(
+            video=video,
+            fps=frame_rate,
+            audio=audio,
+            output_path=output_path,
+            video_chunks_number=video_chunks_number,
+        )
+
+        log_memory("after encode_video")
+        return str(output_path), current_seed
 
     except Exception as e:
         import traceback
         log_memory("on error")
-        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
-        print(error_msg)
+        print(f"Error: {str(e)}\n{traceback.format_exc()}")
         return None, current_seed
 
 
 with gr.Blocks(title="LTX-2.3 Distilled") as demo:
     gr.Markdown("# LTX-2.3 Distilled (22B): Fast Audio-Video Generation")
     gr.Markdown(
-        "Fast video + audio generation using the distilled model (8 steps stage 1, 4 steps stage 2). "
-        "[[model]](https://huggingface.co/Lightricks/LTX-2) "
+        "Fast and high quality video + audio generation"
+        "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
        "[[code]](https://github.com/Lightricks/LTX-2)"
     )

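Note: one caveat in the inlined output path above: tempfile.mktemp only returns an unused name without creating the file, and the standard library documents it as race-prone. A safer drop-in sketch that keeps the same output_path variable:

    import tempfile

    # The file is created immediately, so the name cannot be claimed
    # by another process between name generation and first write.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as f:
        output_path = f.name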
@@ -286,9 +259,12 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
                 lines=3,
                 placeholder="Describe the motion and animation you want...",
             )
+
             with gr.Row():
                 duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
-                enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=True)
+                with gr.Column():
+                    enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=False)
+                    high_res = gr.Checkbox(label="High Resolution", value=True)
 
             generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
@@ -296,12 +272,26 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
             seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, value=10, step=1)
             randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
             with gr.Row():
-                width = gr.Number(label="Width", value=DEFAULT_WIDTH, precision=0)
-                height = gr.Number(label="Height", value=DEFAULT_HEIGHT, precision=0)
+                width = gr.Number(label="Width", value=1536, precision=0)
+                height = gr.Number(label="Height", value=1024, precision=0)
 
         with gr.Column():
             output_video = gr.Video(label="Generated Video", autoplay=True)
 
+    # Auto-detect aspect ratio from uploaded image and set resolution
+    input_image.change(
+        fn=on_image_upload,
+        inputs=[input_image, high_res],
+        outputs=[width, height],
+    )
+
+    # Update resolution when high-res toggle changes
+    high_res.change(
+        fn=on_highres_toggle,
+        inputs=[input_image, high_res],
+        outputs=[width, height],
+    )
+
     generate_btn.click(
         fn=generate_video,
         inputs=[
@@ -313,7 +303,7 @@ with gr.Blocks(title="LTX-2.3 Distilled") as demo:
 
 
 css = """
-.gradio-container .contain{max-width: 1200px !important; margin: 0 auto !important}
+.fillable{max-width: 1200px !important}
 """
 
 if __name__ == "__main__":
 