Update app.py
app.py CHANGED

@@ -9,6 +9,9 @@ import numpy as np
 import random
 import spaces
 import gradio as gr
+from gradio_client import Client, handle_file
+import torch
+from pathlib import Path
 from typing import Optional
 from huggingface_hub import hf_hub_download
 from ltx_pipelines.ti2vid_two_stages import TI2VidTwoStagesPipeline
@@ -33,11 +36,13 @@ DEFAULT_PROMPT = "An astronaut hatches from a fragile egg on the surface of the
 
 # HuggingFace Hub defaults
 DEFAULT_REPO_ID = "Lightricks/LTX-2"
-DEFAULT_GEMMA_REPO_ID = "google/gemma-3-12b-it-qat-q4_0-unquantized"
 DEFAULT_CHECKPOINT_FILENAME = "ltx-2-19b-dev-fp8.safetensors"
 DEFAULT_DISTILLED_LORA_FILENAME = "ltx-2-19b-distilled-lora-384.safetensors"
 DEFAULT_SPATIAL_UPSAMPLER_FILENAME = "ltx-2-spatial-upscaler-x2-1.0.safetensors"
 
+# Text encoder space URL
+TEXT_ENCODER_SPACE = "linoyts/gemma-text-encoder"
+
 def get_hub_or_local_checkpoint(repo_id: Optional[str] = None, filename: Optional[str] = None):
     """Download from HuggingFace Hub or use local checkpoint."""
     if repo_id is None and filename is None:
@@ -68,24 +73,36 @@ print(f"Initializing pipeline with:")
 print(f" checkpoint_path={checkpoint_path}")
 print(f" distilled_lora_path={distilled_lora_path}")
 print(f" spatial_upsampler_path={spatial_upsampler_path}")
-print(f"
+print(f" text_encoder_space={TEXT_ENCODER_SPACE}")
 
+# Initialize pipeline WITHOUT text encoder (gemma_root=None)
+# Text encoding will be done by external space
 pipeline = TI2VidTwoStagesPipeline(
     checkpoint_path=checkpoint_path,
     distilled_lora_path=distilled_lora_path,
     distilled_lora_strength=DEFAULT_LORA_STRENGTH,
     spatial_upsampler_path=spatial_upsampler_path,
-    gemma_root=
+    gemma_root=None,
     loras=[],
     fp8transformer=False,
     local_files_only=False
 )
 
+# Initialize text encoder client
+print(f"Connecting to text encoder space: {TEXT_ENCODER_SPACE}")
+try:
+    text_encoder_client = Client(TEXT_ENCODER_SPACE)
+    print("✓ Text encoder client connected!")
+except Exception as e:
+    print(f"⚠ Warning: Could not connect to text encoder space: {e}")
+    text_encoder_client = None
+
 @spaces.GPU(duration=300)
 def generate_video(
     input_image,
     prompt: str,
     duration: float,
+    enhance_prompt: bool = True,
     negative_prompt: str = DEFAULT_NEGATIVE_PROMPT,
     seed: int = 42,
     randomize_seed: bool = True,
@@ -107,20 +124,56 @@ def generate_video(
     # Create output directory if it doesn't exist
     output_dir = Path("outputs")
     output_dir.mkdir(exist_ok=True)
-    output_path = output_dir / f"video_{
+    output_path = output_dir / f"video_{current_seed}.mp4"
 
     # Handle image input
     images = []
+    temp_image_path = None  # Initialize to None
     if input_image is not None:
         # Save uploaded image temporarily
-        temp_image_path = output_dir / f"temp_input_{
+        temp_image_path = output_dir / f"temp_input_{current_seed}.jpg"
         if hasattr(input_image, 'save'):
             input_image.save(temp_image_path)
         else:
             # If it's a file path already
-            temp_image_path = input_image
+            temp_image_path = Path(input_image)
         # Format: (image_path, frame_idx, strength)
         images = [(str(temp_image_path), 0, 1.0)]
+    # Get embeddings from text encoder space
+    print(f"Encoding prompt: {prompt}")
+
+    if text_encoder_client is None:
+        raise RuntimeError(
+            f"Text encoder client not connected. Please ensure the text encoder space "
+            f"({TEXT_ENCODER_SPACE}) is running and accessible."
+        )
+
+    try:
+        # Prepare image for upload if it exists
+        image_input = None
+        if temp_image_path is not None:
+            image_input = handle_file(str(temp_image_path))
+
+        result = text_encoder_client.predict(
+            prompt=prompt,
+            enhance_prompt=enhance_prompt,
+            input_image=image_input,
+            seed=current_seed,
+            api_name="/encode_prompt"
+        )
+        embedding_path = result[0]  # Path to .pt file
+        print(f"Embeddings received from: {embedding_path}")
+
+        # Load embeddings
+        embeddings = torch.load(embedding_path)
+        video_context = embeddings['video_context']
+        audio_context = embeddings['audio_context']
+        print("✓ Embeddings loaded successfully")
+    except Exception as e:
+        raise RuntimeError(
+            f"Failed to get embeddings from text encoder space: {e}\n"
+            f"Please ensure {TEXT_ENCODER_SPACE} is running properly."
+        )
 
     # Run inference - progress automatically tracks tqdm from pipeline
     pipeline(
@@ -136,6 +189,8 @@ def generate_video(
         cfg_guidance_scale=cfg_guidance_scale,
         images=images,
         tiling_config=TilingConfig.default(),
+        video_context=video_context,
+        audio_context=audio_context,
     )
 
     return str(output_path), current_seed
@@ -166,13 +221,18 @@ with gr.Blocks(title="LTX-2 Video 🎥🔈") as demo:
                 placeholder="Describe the motion and animation you want..."
             )
 
-
-
-
-
-
-
-
+            with gr.Row():
+                duration = gr.Slider(
+                    label="Duration (seconds)",
+                    minimum=1.0,
+                    maximum=10.0,
+                    value=3.0,
+                    step=0.1
+                )
+                enhance_prompt = gr.Checkbox(
+                    label="Enhance Prompt",
+                    value=True
+                )
 
             generate_btn = gr.Button("Generate Video", variant="primary")
 
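The substance of this commit is a remote text-encoding round-trip: the Gemma encoder is dropped from the local pipeline (DEFAULT_GEMMA_REPO_ID removed, gemma_root=None), and generate_video instead calls the linoyts/gemma-text-encoder Space through gradio_client, whose /encode_prompt endpoint returns a .pt file holding the video_context and audio_context embeddings that the pipeline call now accepts. Below is a minimal sketch for exercising that Space on its own, using only the calls that appear in the diff; the prompt and the final key check are illustrative, not part of the app.

import torch
from gradio_client import Client, handle_file

# Same Space and endpoint as in the diff above.
client = Client("linoyts/gemma-text-encoder")

result = client.predict(
    prompt="An astronaut hatches from a fragile egg on the moon",  # example prompt
    enhance_prompt=True,
    input_image=None,  # or handle_file("frame0.jpg") for image conditioning
    seed=42,
    api_name="/encode_prompt",
)

embedding_path = result[0]  # first return value: path to the downloaded .pt file
embeddings = torch.load(embedding_path)
print(sorted(embeddings))  # expected keys per the diff: 'audio_context', 'video_context'

Hosting the 12B Gemma encoder in a separate Space trades a network hop per generation for a smaller GPU footprint in this one; note that generate_video raises RuntimeError when the client never connected, rather than silently falling back to local encoding.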